9 changes: 9 additions & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -216,6 +216,15 @@ policy:
top_k: null
stop_token_ids: null
stop_strings: null
mcore_generation_config:
buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers
buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests
num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes
block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity)
use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
enable_chunked_prefill: true # Split long prefills into chunks for better memory management
unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step
vllm_cfg:
async_engine: false
precision: ${policy.precision}
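For a rough sense of what these knobs trade off: the buffer is carved into fixed-size blocks, so buffer_size_gb and block_size_tokens together bound how many KV-cache blocks exist, and buffer_guaranteed_fraction reserves a slice of them for requests that must never be preempted. A back-of-the-envelope sketch (illustrative arithmetic only, not Megatron's actual allocator; kv_bytes_per_token is an assumed example value):

def kv_cache_budget(buffer_size_gb: float, buffer_guaranteed_fraction: float,
                    block_size_tokens: int, kv_bytes_per_token: int):
    # Total buffer bytes, carved into fixed-size blocks of block_size_tokens.
    total_bytes = buffer_size_gb * 1024**3
    block_bytes = block_size_tokens * kv_bytes_per_token
    total_blocks = int(total_bytes // block_bytes)
    # Blocks held back for guaranteed (non-preemptible) active requests.
    guaranteed_blocks = int(total_blocks * buffer_guaranteed_fraction)
    return total_blocks, guaranteed_blocks

# Assumes ~160 KiB of KV per token; the real figure depends on the model.
total, guaranteed = kv_cache_budget(20, 0.1, 256, 160 * 1024)
print(f"{total} blocks total, {guaranteed} reserved for guaranteed requests")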
2 changes: 1 addition & 1 deletion examples/configs/grpo_math_1B_megatron.yaml
@@ -150,7 +150,7 @@ policy:
use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
enable_chunked_prefill: true # Split long prefills into chunks for better memory management
unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
max_tokens: 16384 # Maximum number of tokens to use in a single step
max_tokens: ${policy.max_total_sequence_length} # Maximum number of tokens to use in a single step

vllm_cfg:
tensor_parallel_size: 1
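The ${policy.max_total_sequence_length} reference is a standard OmegaConf-style interpolation, so max_tokens now tracks the policy's sequence length instead of a hard-coded 16384. A minimal sketch of how such an interpolation resolves (assumes OmegaConf; the nesting below is a simplified stand-in for the real config tree):

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "policy": {
        "max_total_sequence_length": 16384,
        "mcore_generation_config": {
            "max_tokens": "${policy.max_total_sequence_length}",
        },
    },
})
# The interpolation resolves on access, so the two values stay in sync.
assert cfg.policy.mcore_generation_config.max_tokens == 16384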
2 changes: 1 addition & 1 deletion examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
@@ -29,7 +29,7 @@ checkpointing:
checkpoint_dir: results/dapo-qwen2.5-7b
keep_top_k: 5
save_period: 5
model_save_format: "dcp"
model_save_format: null
policy:
model_name: Qwen/Qwen2.5-Math-7B
hf_config_overrides:
4 changes: 4 additions & 0 deletions nemo_rl/models/generation/vllm/vllm_worker.py
@@ -388,6 +388,10 @@ def _patch_vllm_vit_flash_attn_backend():
)
# disable quantization
vllm_kwargs["hf_overrides"]["quantization_config"] = {}
elif "Gemma3ForConditionalGeneration" in getattr(hf_config, "architectures", []):
if self.cfg["vllm_cfg"]["skip_tokenizer_init"]:
print("Gemma3ForConditionalGeneration models may crash when skip_tokenizer_init is True. NeMo-RL is forcing it to False for this architecture. See https://github.com/NVIDIA-NeMo/RL/issues/1681 for more details.")
self.cfg["vllm_cfg"]["skip_tokenizer_init"] = False

llm_kwargs = dict(
model=self.model_name,
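The guard keys off the model's declared architectures, a standard field on Hugging Face configs. A standalone sketch of the same check (the model name and vllm_cfg dict here are illustrative; the real code mutates self.cfg as shown above):

from transformers import AutoConfig

vllm_cfg = {"skip_tokenizer_init": True}
hf_config = AutoConfig.from_pretrained("google/gemma-3-4b-it")  # assumed example model
if "Gemma3ForConditionalGeneration" in getattr(hf_config, "architectures", []):
    # Work around the crash tracked in NVIDIA-NeMo/RL issue #1681.
    vllm_cfg["skip_tokenizer_init"] = False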
2 changes: 1 addition & 1 deletion nemo_rl/models/policy/workers/dtensor_policy_worker.py
@@ -1839,7 +1839,7 @@ def move_buffer_to_device(
) -> nn.Module:
# FSDP modules do not move buffers to the device automatically
for v in model.buffers():
v.data = v.data.to(device)
v = v.to(device)

return model

8 changes: 4 additions & 4 deletions nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py
@@ -324,10 +324,10 @@ def __init__(
print(
"[WARNING]: sequence_parallel=True, but tp_size=1 which has no effect. Enable tp_size > 1 to use sequence parallelism."
)
elif sequence_parallel_enabled and tp_size > 1:
raise RuntimeError(
"Sequence parallel + tp_size >1 is currently broken in torch==2.8.0. See https://github.com/NVIDIA-NeMo/Automodel/issues/652 for more details."
)
#elif sequence_parallel_enabled and tp_size > 1:
# raise RuntimeError(
# "Sequence parallel + tp_size >1 is currently broken in torch==2.8.0. See https://github.com/NVIDIA-NeMo/Automodel/issues/652 for more details."
# )

if cp_size > 1:
assert not isinstance(self.model, Gemma3ForCausalLM), (
31 changes: 22 additions & 9 deletions nemo_rl/utils/logger.py
@@ -121,6 +121,23 @@ def __init__(self, cfg: TensorboardConfig, log_dir: Optional[str] = None):
self.writer = SummaryWriter(log_dir=log_dir)
print(f"Initialized TensorboardLogger at {log_dir}")

@staticmethod
def _coerce_to_scalar(value: Any) -> int | float | bool | str | None:
"""Coerce a value to a Python scalar for TensorBoard logging.

Returns the coerced value, or None if it can't be converted to a scalar.
"""
if isinstance(value, (int, float, bool, str)):
return value
if isinstance(value, (np.floating, np.integer, np.bool_)):
return value.item()
if isinstance(value, np.ndarray) and (value.ndim == 0 or value.size == 1):
return value.item()
if isinstance(value, torch.Tensor) and (value.ndim == 0 or value.numel() == 1):
return value.item()
# dict, list, multi-element arrays/tensors, or incompatible types
return None

def log_metrics(
self,
metrics: dict[str, Any],
@@ -137,23 +154,19 @@ def log_metrics(
step_metric: Optional step metric name (ignored in TensorBoard)
"""
for name, value in metrics.items():
# NeMo-Gym will add additional metrics like wandb histograms. However, some people will log to Tensorboard instead which may not be compatible
# This logic catches non-compatible objects being logged.
if not isinstance(value, (int, float, bool, str)):
continue

if prefix:
name = f"{prefix}/{name}"

# Skip non-scalar values that TensorBoard can't handle
if isinstance(value, (dict, list)):
scalar = self._coerce_to_scalar(value)
if scalar is None:
print(
f"Warning: Skipping non-scalar metric '{name}' for TensorBoard logging (type: {type(value).__name__})"
f"Warning: Skipping metric '{name}' for TensorBoard logging "
f"(unsupported type: {type(value).__name__})"
)
continue

try:
self.writer.add_scalar(name, value, step)
self.writer.add_scalar(name, scalar, step)
except Exception as e:
print(f"Warning: Failed to log metric '{name}' to TensorBoard: {e}")
continue
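A quick standalone check of the coercion rules (assumes numpy and torch, which logger.py already imports, and that the package is installed so the import below works):

import numpy as np
import torch
from nemo_rl.utils.logger import TensorboardLogger

coerce = TensorboardLogger._coerce_to_scalar
assert coerce(3.14) == 3.14                    # plain scalars pass through
assert coerce(np.float32(2.5)) == 2.5          # numpy scalars -> .item()
assert coerce(torch.tensor([7])) == 7          # one-element tensors -> .item()
assert coerce(torch.ones(2, 2)) is None        # multi-element tensors skipped
assert coerce({"hist": [1, 2]}) is None        # dicts/lists (e.g. histograms) skipped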
34 changes: 34 additions & 0 deletions tests/check_metrics.py
@@ -97,6 +97,38 @@ def mean(value, range_start=1, range_end=0, ignore_top_p=0.0):
return statistics.mean(vals)


def median(value, range_start=1, range_end=0):
"""Return the median of values (or a range of values) in a dictionary.

Note:
Steps and ranges are 1-indexed; range_end is exclusive.
range_end=0 means the range extends through the last step in the run.

Args:
value: Dictionary of step -> value
range_start: Starting step (1-indexed, default=1)
range_end: Ending step (1-indexed, exclusive, 0 means last step)
"""

## find potential offset that might arise from resuming from a checkpoint
max_step_reached = builtins.max([int(s) for s in value.keys()])
## this is the number of steps that occurred prior to resuming
offset = max_step_reached - len(value)

num_elem = len(value)
if range_start < 0:
range_start += num_elem + 1 + offset
if range_end <= 0:
range_end += num_elem + 1 + offset

vals = []
for step, v in value.items():
if range_start <= int(step) and int(step) < range_end:
vals.append(float(v))

return statistics.median(vals)


def evaluate_check(data: dict, check: str) -> tuple[bool, str, object]:
"""Evaluate a check against the data.

@@ -109,6 +141,7 @@ def evaluate_check(data: dict, check: str) -> tuple[bool, str, object]:
"min": min,
"max": max,
"mean": mean,
"median": median,
"ratio_above": ratio_above,
}

@@ -152,6 +185,7 @@ def main():
# Use helper functions
python check_metrics.py results.json "min(data['class_f1']) > 0.6"
python check_metrics.py results.json "mean(data['accuracies']) > 0.85"
python check_metrics.py results.json "median(data['accuracies']) > 0.85"
python check_metrics.py results.json "mean(data['loss'], ignore_top_p=0.05) < 1.5"
python check_metrics.py results.json "ratio_above(data['error'], 1.05) < 0.02"
"""
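A toy illustration of the 1-indexed, end-exclusive range semantics the new median helper shares with mean (assumes check_metrics.py is importable; step keys are strings, matching the JSON dumped from the TensorBoard logs):

from check_metrics import median

data = {"1": 0.9, "2": 0.5, "3": 0.4, "4": 0.3, "5": 0.2}
assert median(data) == 0.4                   # all steps: median of five values
assert median(data, range_start=-3) == 0.3   # last three steps only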
5 changes: 4 additions & 1 deletion tests/test_suites/llm/dapo-qwen2.5-7b.sh
@@ -34,8 +34,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
uv run tests/check_metrics.py $JSON_METRICS \
'mean(data["train/token_mult_prob_error"]) < 1.1' \
'median(data["train/token_mult_prob_error"]) < 1.1' \
'data["train/token_mult_prob_error"]["20"] < 1.05' \
'data["train/reward"]["20"] > -0.45' \
'data["train/filtered_reward"]["20"] > -0.2'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -20,7 +20,7 @@ uv run examples/run_distillation_math.py \
distillation.val_period=20 \
logger.log_dir=$LOG_DIR \
logger.wandb_enabled=True \
logger.wandb.project=nemo-rl-distillation \
logger.wandb.project=nemo-rl \
logger.wandb.name=$EXP_NAME \
logger.monitor_gpus=True \
logger.tensorboard_enabled=True \
@@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/loss"]["10"] < 0.5' \
'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \
'mean(data["timing/train/total_step_time"], -6, -1) < 500'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -20,7 +20,7 @@ uv run examples/run_distillation_math.py \
distillation.val_period=20 \
logger.log_dir=$LOG_DIR \
logger.wandb_enabled=True \
logger.wandb.project=nemo-rl-distillation \
logger.wandb.project=nemo-rl \
logger.wandb.name=$EXP_NAME \
logger.monitor_gpus=True \
logger.tensorboard_enabled=True \
@@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/loss"]["10"] < 0.5' \
'max(data["ray/node.0.gpu.0.mem_gb"]) < 75' \
'mean(data["timing/train/total_step_time"], -6, -1) < 500'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \
distillation.max_num_steps=$MAX_STEPS \
logger.log_dir=$LOG_DIR \
logger.wandb_enabled=True \
logger.wandb.project=nemo-rl-distillation \
logger.wandb.project=nemo-rl \
logger.wandb.name=$EXP_NAME \
logger.monitor_gpus=True \
logger.tensorboard_enabled=True \
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/loss"]["20"] < 0.3' \
'data["validation/accuracy"]["20"] > 0.1' \
'mean(data["timing/train/total_step_time"], -6, -1) < 1000'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \
distillation.max_num_steps=$MAX_STEPS \
logger.log_dir=$LOG_DIR \
logger.wandb_enabled=True \
logger.wandb.project=nemo-rl-distillation \
logger.wandb.project=nemo-rl \
logger.wandb.name=$EXP_NAME \
logger.monitor_gpus=True \
logger.tensorboard_enabled=True \
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/loss"]["100"] < 0.25' \
'data["validation/accuracy"]["100"] > 0.2' \
'mean(data["timing/train/total_step_time"], -6, -1) < 1600'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \
distillation.max_num_steps=$MAX_STEPS \
logger.log_dir=$LOG_DIR \
logger.wandb_enabled=True \
logger.wandb.project=nemo-rl-distillation \
logger.wandb.project=nemo-rl \
logger.wandb.name=$EXP_NAME \
logger.monitor_gpus=True \
logger.tensorboard_enabled=True \
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/loss"]["20"] < 0.3' \
'data["validation/accuracy"]["20"] > 0.1' \
'mean(data["timing/train/total_step_time"], -6, -1) < 1000'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -19,7 +19,7 @@ uv run examples/run_distillation_math.py \
distillation.max_num_steps=$MAX_STEPS \
logger.log_dir=$LOG_DIR \
logger.wandb_enabled=True \
logger.wandb.project=nemo-rl-distillation \
logger.wandb.project=nemo-rl \
logger.wandb.name=$EXP_NAME \
logger.monitor_gpus=True \
logger.tensorboard_enabled=True \
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/loss"]["20"] < 0.3' \
'data["validation/accuracy"]["20"] > 0.1' \
'mean(data["timing/train/total_step_time"], -6, -1) < 1000'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/preference_loss"]["1"] < 0.69316' \
'data["train/preference_loss"]["20"] < 0.6' \
'mean(data["timing/train/total_step_time"], -10, -1) < 7.8'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -34,10 +34,13 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
uv run tests/check_metrics.py $JSON_METRICS \
'data["train/loss"]["1"] < 3.6' \
'data["train/loss"]["1"] < 3.65' \
'data["train/loss"]["150"] < 3.0' \
'data["train/preference_loss"]["1"] > 0.69314' \
'data["train/preference_loss"]["1"] < 0.69316' \
'data["train/preference_loss"]["150"] < 0.4' \
'mean(data["timing/train/total_step_time"], -11, -1) < 24'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -34,10 +34,13 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
uv run tests/check_metrics.py $JSON_METRICS \
'data["train/loss"]["1"] < 3.6' \
'data["train/loss"]["1"] < 3.65' \
'data["train/loss"]["150"] < 3.0' \
'data["train/preference_loss"]["1"] > 0.69314' \
'data["train/preference_loss"]["1"] < 0.69316' \
'data["train/preference_loss"]["150"] < 0.4' \
'mean(data["timing/train/total_step_time"], -11, -1) < 11.5'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -40,4 +40,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/preference_loss"]["1"] < 0.69316' \
'data["train/preference_loss"]["20"] < 0.6' \
'mean(data["timing/train/total_step_time"], -10) < 6.7'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -39,4 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/preference_loss"]["1"] > 0.6930' \
'data["train/preference_loss"]["1"] < 0.6932' \
'data["train/preference_loss"]["150"] < 0.68'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -38,4 +38,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
'data["train/loss"]["1"] < 0.69316' \
'data["train/loss"]["150"] < 0.55' \
'mean(data["timing/train/total_step_time"], -11, -1) < 1.3'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
(another file; path not rendered in this view)
@@ -34,7 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
uv run tests/check_metrics.py $JSON_METRICS \
'data["train/loss"]["1"] > 0.6990' \
'data["train/loss"]["1"] < 0.6992' \
'data["train/loss"]["1"] > 0.680' \
'data["train/loss"]["1"] < 0.70' \
'data["train/loss"]["100"] < 0.60'
fi

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
3 changes: 3 additions & 0 deletions tests/test_suites/llm/grpo-dapomath17k-dsv3-megatron.sh
@@ -43,4 +43,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
uv run tests/check_metrics.py $JSON_METRICS \
'min(data["train/token_mult_prob_error"]) < 1.05' \
'data["train/reward"]["10"] > 0.4'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"
fi
5 changes: 4 additions & 1 deletion tests/test_suites/llm/grpo-deepscaler-1.5b-16K.sh
@@ -41,7 +41,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
uv run tests/check_metrics.py $JSON_METRICS \
'mean(data["train/token_mult_prob_error"]) < 1.05' \
'median(data["train/token_mult_prob_error"]) < 1.05' \
"data['train/token_mult_prob_error']['$MAX_STEPS'] < 1.05"
fi

@@ -66,3 +66,6 @@ cat ${RUN_LOG}.aime-16k | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"sco
# 240 step checkpoint 0.3
uv run tests/check_metrics.py ${RUN_LOG}-16k-metric.json \
'data["score"] >= 0.2396'

# Clean up checkpoint directory after successful run to save space.
rm -rf "$CKPT_DIR"