Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions aks-node-controller/helpers/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const (
LoadBalancerStandard = "Standard"
VMSizeStandardDc2s = "Standard_DC2s"
VMSizeStandardDc4s = "Standard_DC4s"
VMSizeStandardNM16adsMA35D = "Standard_NM16ads_MA35D"
DefaultLinuxUser = "azureuser"
DefaultCloudName = "AzurePublicCloud"
AksCustomCloudName = "akscustom"
Expand Down
9 changes: 9 additions & 0 deletions aks-node-controller/parser/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,15 @@ func getIsSgxEnabledSKU(vmSize string) bool {
return false
}

// getIsAmdAmaEnabledSKU determines if an VM SKU has AMD AMA GPU HW support.
func getIsAmdAmaEnabledSKU(vmSize string) bool {
switch vmSize {
case helpers.VMSizeStandardNM16adsMA35D:
return true
}
return false
}

func getShouldConfigureHTTPProxy(httpProxyConfig *aksnodeconfigv1.HttpProxyConfig) bool {
return httpProxyConfig.GetHttpProxy() != "" || httpProxyConfig.GetHttpsProxy() != ""
}
Expand Down
1 change: 1 addition & 0 deletions aks-node-controller/parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string {
"API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(),
"IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)),
"GPU_NODE": fmt.Sprintf("%v", getEnableNvidia(config)),
"AMDAMA_NODE": fmt.Sprintf("%v", getIsAmdAmaEnabledSKU(config.GetVmSize())),
"SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())),
"MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())),
"CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()),
Expand Down
42 changes: 42 additions & 0 deletions e2e/scenario_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1295,6 +1295,48 @@ func Test_AzureLinuxV3_MessageOfTheDay_Scriptless(t *testing.T) {
})
}

func Test_AzureLinuxV3_MA35D(t *testing.T) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AzureLinux is the only OS that support this VMSku correct ?

RunScenario(t, &Scenario{
Description: "Tests that a node using a AzureLinuxV3 can support MA35D SKU",
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDAzureLinuxV3Gen2,
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NM16ads_MA35D"
nbc.AgentPoolProfile.VMSize = "Standard_NM16ads_MA35D"
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr("Standard_NM16ads_MA35D")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateSystemdUnitIsRunning(ctx, s, "amdama-device-plugin.service")
},
},
})
}

func Test_AzureLinuxV3_MA35D_Scriptless(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Tests that a node using a AzureLinuxV3 can support MA35D SKU",
Tags: Tags{
Scriptless: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDAzureLinuxV3Gen2,
AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) {
config.VmSize = "Standard_NM16ads_MA35D"
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr("Standard_NM16ads_MA35D")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateSystemdUnitIsRunning(ctx, s, "amdama-device-plugin.service")
},
},
})
}

func Test_AzureLinuxV3LocalDns_Disabled_Scriptless(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Tests that a node using a AzureLinuxV3 can be bootstrapped with localdns disabled",
Expand Down
1 change: 1 addition & 0 deletions parts/linux/cloud-init/artifacts/cse_cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ IDENTITY_BINDINGS_LOCAL_AUTHORITY_SNI={{GetVariable "identityBindingsLocalAuthor
API_SERVER_NAME={{GetKubernetesEndpoint}}
IS_VHD={{GetVariable "isVHD"}}
GPU_NODE={{GetVariable "gpuNode"}}
AMDAMA_NODE="{{AmdAmaEnabledSKU}}"
SGX_NODE={{GetVariable "sgxNode"}}
MIG_NODE={{GetVariable "migNode"}}
CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}}
Expand Down
26 changes: 26 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -980,6 +980,32 @@ ensureGPUDrivers() {
fi
}

setupAmdAma() {
if [ "$(isARM64)" -eq 1 ]; then
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this even possible ? to have AMDAMA_MODE == true and with a arm64 arch ? What would be the consequences here if this cluase is true and we expect to have AMDAMA_MODE and we dont setup anymore.

return
fi

if isMarinerOrAzureLinux "$OS"; then
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same, could we have ubuntu ? if it's the case what will happen ? should we baill out instead and cause anode bootstrap failure ? I would expect that Ubuntu/Flatcar and ARM64 will all be blocked at the RP level to provide users with early validation failure ?

One more question, does isMarinerOrAzureLinux return true for OSGuard I recall there were extrra check required to ensure it wasn't LinuxOSGuard

# Install driver
sudo tdnf install -y azurelinux-repos-amd
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we don't need sudo, since the CSE extension is running as root. we recently had an outage cause some scripts were calling SUDO using an expirer root password..

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should use the helper function for any dnf command. which handle retries, etc

KERNEL_VERSION=$(uname -r | sed 's/-/./g')
AMD_AMA_DRIVER_PACKAGE=$(dnf repoquery -y --available "amd-ama-driver*" | grep -E "amd-ama-driver-[0-9]+.*_$KERNEL_VERSION" | sort -V | tail -n 1)
sudo tdnf install -y $AMD_AMA_DRIVER_PACKAGE
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we combine into a single dnf call ?

# Install core package
sudo tdnf install -y libzip
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all tdnf calls will not work on VM with restricted egress network.

sudo tdnf install -y azurelinux-repos-extended
sudo RPM_FRONTEND=noninteractive tdnf install -y https://download.microsoft.com/download/16b04fa7-883e-4a94-88c2-801881a47b28/amd-ama-core_1.3.0-2503242033-amd64.rpm
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is very much hardcoded, if it's for testing purposes for now, I'm ok but we need to move the package to components.json and also ensure we can detect new versions automatically with renovate. Let's do that in another PR.

# Install AKS device plugin
sudo tdnf install -y amdama-device-plugin.x86_64
# Configure huge pages
sudo sh -c "echo 'vm.nr_hugepages=4096' >> /etc/sysctl.conf"
sudo sh -c "echo 4096 >> /proc/sys/vm/nr_hugepages"
if [ $(systemctl is-active kubelet) = "active" ]; then
sudo systemctl restart kubelet
fi
fi
}

disableSSH() {
# On ubuntu, the ssh service is named "ssh.service"
systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH
Expand Down
7 changes: 7 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,13 @@ function nodePrep {
echo $(date),$(hostname), "End configuring GPU drivers"
fi

# Install and configure AMD AMA (Supernova) drivers if this is an AMA node
if [ "${AMDAMA_NODE}" = "true" ]; then
logs_to_events "AKS.CSE.setupAmdAma" setupAmdAma
else
logs_to_events "AKS.CSE.setupAmdAma" "echo AMD AMA HW not found!"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to log this eventk, knowing it will be the case 99.9% of the time ?

fi

export -f enableManagedGPUExperience
ENABLE_MANAGED_GPU_EXPERIENCE=$(retrycmd_silent 10 1 10 bash -cx enableManagedGPUExperience)
if [ "$?" -ne 0 ] && [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then
Expand Down
12 changes: 12 additions & 0 deletions pkg/agent/baker.go
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,9 @@
"GPUDriverType": func() string {
return GetGPUDriverType(profile.VMSize)
},
"AmdAmaEnabledSKU": func() bool {
return IsAmdAmaEnabledSKU(profile.VMSize)
},
"GetHnsRemediatorIntervalInMinutes": func() uint32 {
// Only need to enable HNSRemediator for Windows 2019
if cs.Properties.WindowsProfile != nil && profile.Distro == datamodel.AKSWindows2019Containerd {
Expand Down Expand Up @@ -1269,6 +1272,15 @@
return datamodel.FabricManagerGPUSizes[strings.ToLower(size)]
}

// IsAmdAmaEnabledSKU determines if an VM SKU has AMD AMA GPU HW support.
func IsAmdAmaEnabledSKU(vmSize string) bool {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is the first time we perform a logic in the VHD to mapping SKU to feature... normally this is done in RP.

switch vmSize {

Check failure on line 1277 in pkg/agent/baker.go

View workflow job for this annotation

GitHub Actions / lint (.)

singleCaseSwitch: should rewrite switch statement to if statement (gocritic)
case "Standard_NM16ads_MA35D":
return true
}
return false
}

func areCustomCATrustCertsPopulated(config datamodel.NodeBootstrappingConfiguration) bool {
return config.CustomCATrustConfig != nil && len(config.CustomCATrustConfig.CustomCATrustCerts) > 0
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/agent/datamodel/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@
return false
}

func IsAmdAmaEnabledSKU(vmSize string) bool {
switch vmSize {

Check failure on line 48 in pkg/agent/datamodel/helper.go

View workflow job for this annotation

GitHub Actions / lint (.)

singleCaseSwitch: should rewrite switch statement to if statement (gocritic)
case "Standard_NM16ads_MA35D":
return true
}
return false
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method duplicated 3 times. Should be no more than 2. (Ideally 1)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just following the method used in each file. Presumably the ones we don't need will get removed when the move from scripted to scriptless is done?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also. would you recommend breaking form the model to make lint happy?


// GetStorageAccountType returns the support managed disk storage tier for a give VM size.
func GetStorageAccountType(sizeName string) (string, error) {
spl := strings.Split(sizeName, "_")
Expand Down
1 change: 1 addition & 0 deletions pkg/agent/variables.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ func getCSECommandVariables(config *datamodel.NodeBootstrappingConfiguration) pa
"userAssignedIdentityID": config.UserAssignedIdentityClientID,
"isVHD": isVHD(profile),
"gpuNode": strconv.FormatBool(config.EnableNvidia),
"amdamaNode": strconv.FormatBool(datamodel.IsAmdAmaEnabledSKU(profile.VMSize)),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks unused?

"sgxNode": strconv.FormatBool(datamodel.IsSgxEnabledSKU(profile.VMSize)),
"configGPUDriverIfNeeded": config.ConfigGPUDriverIfNeeded,
"enableGPUDevicePluginIfNeeded": config.EnableGPUDevicePluginIfNeeded,
Expand Down
Loading