Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .github/workflows/_example_tests_runner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@ jobs:
echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
- name: Install dependencies
run: |
# Install git-lfs for Daring-Anteater dataset
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these changes related to the transformers + torch upgrade?

apt-get update && apt-get install -y git-lfs
git lfs install --system
# use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers
python -m pip install ".${{ inputs.pip_install_extras }}"
Expand Down
47 changes: 0 additions & 47 deletions .github/workflows/delete_outdated_pr_branches.yml
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also seems unrelated to the transformers and torch upgrade?

This file was deleted.

12 changes: 6 additions & 6 deletions .github/workflows/example_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
timeout_minutes: 45
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-1

Expand All @@ -82,9 +82,9 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.03' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
timeout_minutes: 45
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2

Expand All @@ -99,7 +99,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-1
Expand All @@ -113,7 +113,7 @@ jobs:
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,19 @@ jobs:
- example: gpu
timeout: 45
container_image: pytorch:26.01-py3
# tests/gpu/_extensions/test_onnx_extensions.py fails on newer containers until https://github.com/tbenthompson/cppimport/pull/98 is merged
- example: gpu-megatron
timeout: 45
container_image: pytorch:26.01-py3
- example: gpu-trtllm
timeout: 30
container_image: tensorrt-llm/release:1.3.0rc5
container_image: tensorrt-llm/release:1.3.0rc10
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
timeout-minutes: ${{ matrix.timeout }}
container: &gpu_container
image: nvcr.io/nvidia/${{ matrix.container_image }}
env:
GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
GIT_DEPTH: 1000 # For correct version
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps: &gpu_steps
Expand Down
12 changes: 8 additions & 4 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
- name: Run unit tests
run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit
run: pip install tox && COV_ARGS="--cov" tox -e py312-torch211-tf_latest-unit
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v5
with:
Expand All @@ -64,6 +64,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
py: [10, 11, 13]
steps:
Expand All @@ -72,15 +73,16 @@ jobs:
with:
python-version: "3.${{ matrix.py }}"
- name: Run unit tests
run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit
run: pip install tox && tox -e py3${{ matrix.py }}-torch211-tf_latest-unit
multi-torch:
if: github.event_name == 'pull_request'
needs: [linux]
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
torch: [26, 27, 28, 29]
torch: [28, 29, 210]
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
Expand All @@ -92,13 +94,14 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
tf: [min]
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/ubuntu-setup
- name: Run unit tests
run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit
run: pip install tox && tox -e py312-torch211-tf_${{ matrix.tf }}-unit
launcher:
if: github.event_name == 'pull_request'
needs: [linux]
Expand All @@ -122,6 +125,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
test-env: [onnx, torch]
steps:
Expand Down
10 changes: 8 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
NVIDIA Model Optimizer Changelog
================================
Changelog
=========

0.44 (2026-05-xx)
^^^^^^^^^^^^^^^^^

Expand All @@ -15,6 +16,11 @@ NVIDIA Model Optimizer Changelog

- Fix Minitron pruning (``mcore_minitron``) for MoE models. Previously, importance estimation hooks were incorrectly registered for MoE modules and the NAS step would hang.

**Misc**

- Bump minimum required PyTorch version to 2.8.
- Add experimental support for transformers>=5.0. Unified Hugging Face checkpoint export for quantized checkpoints may not work for some models with transformers>=5.0 yet.

0.43 (2026-04-09)
^^^^^^^^^^^^^^^^^

Expand Down
2 changes: 1 addition & 1 deletion docs/source/getting_started/_installation_for_Linux.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
+-------------------------+-----------------------------+
| CUDA | 12.x, 13.x |
+-------------------------+-----------------------------+
| PyTorch | >=2.6 |
| PyTorch | >=2.8 |
+-------------------------+-----------------------------+
| TensorRT-LLM (Optional) | >=1.0 |
+-------------------------+-----------------------------+
Expand Down
6 changes: 3 additions & 3 deletions examples/gpt-oss/configs/sft_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ per_device_train_batch_size: 2
per_device_eval_batch_size: 2
gradient_accumulation_steps: 2
max_length: 4096
warmup_ratio: 0.03
warmup_steps: 0.03 # use warmup_ratio if using transformers<5.0
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
Expand All @@ -30,6 +30,6 @@ eval_steps: 8
dataset_test_split: test

# ModelOpt Quantization Parameters
quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG
# For the full list of supported configs, do: mtq.config.choices
quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG
# For the full list of supported configs, do: mtq.config.choices
calib_size: 128
6 changes: 3 additions & 3 deletions examples/gpt-oss/configs/sft_lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ lora_alpha: 16
lora_dropout: 0.0
lora_target_modules: all-linear
max_length: 4096
warmup_ratio: 0.03
warmup_steps: 0.03 # use warmup_ratio if using transformers<5.0
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
min_lr_rate: 0.1
Expand All @@ -35,6 +35,6 @@ eval_steps: 8
dataset_test_split: test

# ModelOpt Quantization Parameters
quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG
# For the full list of supported configs, do: mtq.config.choices
quant_cfg: # Examples: MXFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_WEIGHT_ONLY_CFG, NVFP4_MLP_ONLY_CFG
# For the full list of supported configs, do: mtq.config.choices
calib_size: 128
14 changes: 8 additions & 6 deletions examples/gpt-oss/convert_oai_mxfp4_weight_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,21 +95,23 @@ def convert_and_save(model, tokenizer, output_path: str):

def create_parser():
parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument("--model_path", type=str, help="path to the fake-quantized model from QAT.")

parser.add_argument(
"--trust_remote_code",
help="Set trust_remote_code for Huggingface models and tokenizers",
default=False,
action="store_true",
)
parser.add_argument(
"--lora_path",
type=str,
help="path to the LoRA-QAT adapter weights. You can only specify lora_path or model_path, not both.",
)

parser.add_argument(
"--base_path",
type=str,
help="path to the base model used for LoRA-QAT. Only used if lora_path is specified.",
)

parser.add_argument(
"--output_path", type=str, required=True, help="location to save converted model."
)
Expand All @@ -121,7 +123,7 @@ def create_parser():
parser = create_parser()
args = parser.parse_args()

kwargs = {"device_map": "auto", "torch_dtype": "auto", "trust_remote_code": True}
kwargs = {"device_map": "auto", "dtype": "auto", "trust_remote_code": args.trust_remote_code}
if args.lora_path:
assert args.model_path is None, "You can only specify lora_path or model_path, not both."
model_path = args.base_path
Expand All @@ -140,7 +142,7 @@ def create_parser():
gc.collect()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=args.trust_remote_code)

# Quantize and save model
convert_and_save(model, tokenizer, args.output_path)
2 changes: 1 addition & 1 deletion examples/gpt-oss/qat-finetune-transformers.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@
" per_device_eval_batch_size=1,\n",
" gradient_accumulation_steps=2,\n",
" max_length=4096,\n",
" warmup_ratio=0.03,\n",
" warmup_steps=0.03, # use warmup_ratio if using transformers<5.0\n",
" eval_strategy=\"steps\",\n",
" eval_on_start=True,\n",
" logging_steps=10,\n",
Expand Down
2 changes: 0 additions & 2 deletions examples/gpt-oss/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
kernels>=0.9.0
torch>2.7.1
trackio
transformers>=4.55.0
trl>=0.21.0
2 changes: 1 addition & 1 deletion examples/gpt-oss/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def main(script_args, training_args, model_args, quant_args):
"revision": model_args.model_revision,
"trust_remote_code": model_args.trust_remote_code,
"attn_implementation": model_args.attn_implementation,
"torch_dtype": getattr(model_args, "dtype", "bfloat16"),
"dtype": getattr(model_args, "dtype", "bfloat16"),
"use_cache": not training_args.gradient_checkpointing,
}

Expand Down
12 changes: 10 additions & 2 deletions examples/llm_autodeploy/run_auto_quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,19 @@ def modelopt_ptq(
auto_quantize_bits: float | None = None,
calib_dataset: str = "cnn_dailymail",
calib_batch_size: int = 8,
trust_remote_code: bool = False,
) -> torch.nn.Module:
"""Quantize the model with modelopt."""
model = AutoModelForCausalLM.from_pretrained(
model_path, trust_remote_code=True, torch_dtype="auto", device_map="auto"
model_path, trust_remote_code=trust_remote_code, dtype="auto", device_map="auto"
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(
model_path,
model_max_length=2048,
padding_side="left",
trust_remote_code=True,
trust_remote_code=trust_remote_code,
)
# sanitize tokenizer
if tokenizer.pad_token != "<unk>":
Expand Down Expand Up @@ -203,6 +204,12 @@ def modelopt_ptq(
"regular quantization without auto_quantize search will be applied."
),
)
parser.add_argument(
"--trust_remote_code",
help="Set trust_remote_code for Huggingface models and tokenizers",
default=False,
action="store_true",
)

args = parser.parse_args()

Expand All @@ -213,4 +220,5 @@ def modelopt_ptq(
args.num_samples,
auto_quantize_bits=args.effective_bits,
calib_batch_size=args.calib_batch_size,
trust_remote_code=args.trust_remote_code,
)
1 change: 0 additions & 1 deletion examples/llm_distill/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pyarrow
torchao>=0.14.1
transformers<5.0
trl>=0.23.0
3 changes: 1 addition & 2 deletions examples/llm_eval/lm_eval_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
# limitations under the License.
import warnings

import datasets
from lm_eval import utils
from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser
from lm_eval.api.model import T
Expand Down Expand Up @@ -180,8 +181,6 @@ def setup_parser_with_modelopt_args():
model_args = utils.simple_parse_args_string(args.model_args)

if args.trust_remote_code:
import datasets

datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
model_args["trust_remote_code"] = True
args.trust_remote_code = None
Expand Down
Loading
Loading