From 85c15edf7c3a1dc0eb1b114a4d442259e6a94bd3 Mon Sep 17 00:00:00 2001 From: Mike Preston Date: Wed, 21 Jan 2026 11:13:21 -0800 Subject: [PATCH] Moved changes to sync with main branch. --- aks-node-controller/helpers/const.go | 1 + aks-node-controller/parser/helper.go | 9 ++++ aks-node-controller/parser/parser.go | 1 + e2e/scenario_test.go | 42 +++++++++++++++++++ parts/linux/cloud-init/artifacts/cse_cmd.sh | 1 + .../linux/cloud-init/artifacts/cse_config.sh | 26 ++++++++++++ parts/linux/cloud-init/artifacts/cse_main.sh | 7 ++++ pkg/agent/baker.go | 12 ++++++ pkg/agent/datamodel/helper.go | 8 ++++ pkg/agent/variables.go | 1 + 10 files changed, 108 insertions(+) diff --git a/aks-node-controller/helpers/const.go b/aks-node-controller/helpers/const.go index e9d49da5ab8..a46b46f4cbf 100644 --- a/aks-node-controller/helpers/const.go +++ b/aks-node-controller/helpers/const.go @@ -11,6 +11,7 @@ const ( LoadBalancerStandard = "Standard" VMSizeStandardDc2s = "Standard_DC2s" VMSizeStandardDc4s = "Standard_DC4s" + VMSizeStandardNM16adsMA35D = "Standard_NM16ads_MA35D" DefaultLinuxUser = "azureuser" DefaultCloudName = "AzurePublicCloud" AksCustomCloudName = "akscustom" diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index a2b53403d34..4f873838af4 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -525,6 +525,15 @@ func getIsSgxEnabledSKU(vmSize string) bool { return false } +// getIsAmdAmaEnabledSKU determines if a VM SKU has AMD AMA GPU HW support. 
+func getIsAmdAmaEnabledSKU(vmSize string) bool { + switch vmSize { + case helpers.VMSizeStandardNM16adsMA35D: + return true + } + return false +} + func getShouldConfigureHTTPProxy(httpProxyConfig *aksnodeconfigv1.HttpProxyConfig) bool { return httpProxyConfig.GetHttpProxy() != "" || httpProxyConfig.GetHttpsProxy() != "" } diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index 0cc1e66c329..7f7309e4232 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -80,6 +80,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(), "IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)), "GPU_NODE": fmt.Sprintf("%v", getEnableNvidia(config)), + "AMDAMA_NODE": fmt.Sprintf("%v", getIsAmdAmaEnabledSKU(config.GetVmSize())), "SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())), "MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())), "CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()), diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 925b435329e..a9c7c1bd545 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -1295,6 +1295,48 @@ func Test_AzureLinuxV3_MessageOfTheDay_Scriptless(t *testing.T) { }) } +func Test_AzureLinuxV3_MA35D(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that a node using a AzureLinuxV3 can support MA35D SKU", + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDAzureLinuxV3Gen2, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NM16ads_MA35D" + nbc.AgentPoolProfile.VMSize = "Standard_NM16ads_MA35D" + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NM16ads_MA35D") + }, + Validator: func(ctx 
context.Context, s *Scenario) { + ValidateSystemdUnitIsRunning(ctx, s, "amdama-device-plugin.service") + }, + }, + }) +} + +func Test_AzureLinuxV3_MA35D_Scriptless(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that a node using a AzureLinuxV3 can support MA35D SKU", + Tags: Tags{ + Scriptless: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDAzureLinuxV3Gen2, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.VmSize = "Standard_NM16ads_MA35D" + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NM16ads_MA35D") + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateSystemdUnitIsRunning(ctx, s, "amdama-device-plugin.service") + }, + }, + }) +} + func Test_AzureLinuxV3LocalDns_Disabled_Scriptless(t *testing.T) { RunScenario(t, &Scenario{ Description: "Tests that a node using a AzureLinuxV3 can be bootstrapped with localdns disabled", diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 2a84db81527..6ce0bc118ae 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -74,6 +74,7 @@ IDENTITY_BINDINGS_LOCAL_AUTHORITY_SNI={{GetVariable "identityBindingsLocalAuthor API_SERVER_NAME={{GetKubernetesEndpoint}} IS_VHD={{GetVariable "isVHD"}} GPU_NODE={{GetVariable "gpuNode"}} +AMDAMA_NODE="{{AmdAmaEnabledSKU}}" SGX_NODE={{GetVariable "sgxNode"}} MIG_NODE={{GetVariable "migNode"}} CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}} diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 860097e3207..c7d6a4e1893 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -980,6 +980,32 @@ ensureGPUDrivers() { fi } +setupAmdAma() { + if [ "$(isARM64)" -eq 1 ]; then + return + fi + + if 
isMarinerOrAzureLinux "$OS"; then + # Install driver + sudo tdnf install -y azurelinux-repos-amd + KERNEL_VERSION=$(uname -r | sed 's/-/./g') + AMD_AMA_DRIVER_PACKAGE=$(dnf repoquery -y --available "amd-ama-driver*" | grep -E "amd-ama-driver-[0-9]+.*_$KERNEL_VERSION" | sort -V | tail -n 1) + sudo tdnf install -y $AMD_AMA_DRIVER_PACKAGE + # Install core package + sudo tdnf install -y libzip + sudo tdnf install -y azurelinux-repos-extended + sudo RPM_FRONTEND=noninteractive tdnf install -y https://download.microsoft.com/download/16b04fa7-883e-4a94-88c2-801881a47b28/amd-ama-core_1.3.0-2503242033-amd64.rpm + # Install AKS device plugin + sudo tdnf install -y amdama-device-plugin.x86_64 + # Configure huge pages + sudo sh -c "echo 'vm.nr_hugepages=4096' >> /etc/sysctl.conf" + sudo sh -c "echo 4096 >> /proc/sys/vm/nr_hugepages" + if [ $(systemctl is-active kubelet) = "active" ]; then + sudo systemctl restart kubelet + fi + fi +} + disableSSH() { # On ubuntu, the ssh service is named "ssh.service" systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index d1442ab8db7..44b4f278c9f 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -395,6 +395,13 @@ function nodePrep { echo $(date),$(hostname), "End configuring GPU drivers" fi + # Install and configure AMD AMA (Supernova) drivers if this is an AMA node + if [ "${AMDAMA_NODE}" = "true" ]; then + logs_to_events "AKS.CSE.setupAmdAma" setupAmdAma + else + logs_to_events "AKS.CSE.setupAmdAma" "echo AMD AMA HW not found!" + fi + export -f enableManagedGPUExperience ENABLE_MANAGED_GPU_EXPERIENCE=$(retrycmd_silent 10 1 10 bash -cx enableManagedGPUExperience) if [ "$?" 
-ne 0 ] && [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index 9995807d6af..cf0b3b01a3d 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1083,6 +1083,9 @@ func getContainerServiceFuncMap(config *datamodel.NodeBootstrappingConfiguration "GPUDriverType": func() string { return GetGPUDriverType(profile.VMSize) }, + "AmdAmaEnabledSKU": func() bool { + return IsAmdAmaEnabledSKU(profile.VMSize) + }, "GetHnsRemediatorIntervalInMinutes": func() uint32 { // Only need to enable HNSRemediator for Windows 2019 if cs.Properties.WindowsProfile != nil && profile.Distro == datamodel.AKSWindows2019Containerd { @@ -1269,6 +1272,15 @@ func GPUNeedsFabricManager(size string) bool { return datamodel.FabricManagerGPUSizes[strings.ToLower(size)] } +// IsAmdAmaEnabledSKU determines if a VM SKU has AMD AMA GPU HW support. +func IsAmdAmaEnabledSKU(vmSize string) bool { + switch vmSize { + case "Standard_NM16ads_MA35D": + return true + } + return false +} + func areCustomCATrustCertsPopulated(config datamodel.NodeBootstrappingConfiguration) bool { return config.CustomCATrustConfig != nil && len(config.CustomCATrustConfig.CustomCATrustCerts) > 0 } diff --git a/pkg/agent/datamodel/helper.go b/pkg/agent/datamodel/helper.go index c9d65a014c2..af0f18b9320 100644 --- a/pkg/agent/datamodel/helper.go +++ b/pkg/agent/datamodel/helper.go @@ -44,6 +44,14 @@ func IsSgxEnabledSKU(vmSize string) bool { return false } +func IsAmdAmaEnabledSKU(vmSize string) bool { + switch vmSize { + case "Standard_NM16ads_MA35D": + return true + } + return false +} + // GetStorageAccountType returns the support managed disk storage tier for a give VM size. 
func GetStorageAccountType(sizeName string) (string, error) { spl := strings.Split(sizeName, "_") diff --git a/pkg/agent/variables.go b/pkg/agent/variables.go index 61116902b1a..18a5a2f8281 100644 --- a/pkg/agent/variables.go +++ b/pkg/agent/variables.go @@ -132,6 +132,7 @@ func getCSECommandVariables(config *datamodel.NodeBootstrappingConfiguration) pa "userAssignedIdentityID": config.UserAssignedIdentityClientID, "isVHD": isVHD(profile), "gpuNode": strconv.FormatBool(config.EnableNvidia), + "amdamaNode": strconv.FormatBool(datamodel.IsAmdAmaEnabledSKU(profile.VMSize)), "sgxNode": strconv.FormatBool(datamodel.IsSgxEnabledSKU(profile.VMSize)), "configGPUDriverIfNeeded": config.ConfigGPUDriverIfNeeded, "enableGPUDevicePluginIfNeeded": config.EnableGPUDevicePluginIfNeeded,