Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -1168,6 +1168,50 @@ func ValidateNodeProblemDetector(ctx context.Context, s *Scenario) {
execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "Node Problem Detector (NPD) service validation failed")
}

// ValidateNodeExporter verifies the VHD-baked node-exporter installation: the
// systemd service is running, healthy, and enabled; the binary and config
// assets are in place; and the metrics listener is up.
//
// VHDs that ship node-exporter also ship the skip_vhd_node_exporter sentinel
// (it signals the AKS VM extension that agentbaker owns the component); its
// absence means node-exporter was never installed on this image, so the
// validation is skipped rather than failed.
func ValidateNodeExporter(ctx context.Context, s *Scenario) {
	s.T.Helper()

	skipFile := "/etc/node-exporter.d/skip_vhd_node_exporter"
	serviceName := "node-exporter.service"

	// Check if node-exporter is installed on this VHD by looking for the skip sentinel file.
	// The skip file is only present on VHDs that have node-exporter installed (Ubuntu, Mariner, Azure Linux).
	// Flatcar, OSGuard, and older VHDs do not have node-exporter installed and will not have the skip file.
	if !fileExist(ctx, s, skipFile) {
		s.T.Logf("Skipping node-exporter validation: sentinel file %s not found (VHD does not have node-exporter installed)", skipFile)
		return
	}

	s.T.Logf("skip_vhd_node_exporter sentinel file found, validating node-exporter installation")

	// Validate service is running, not failed, and enabled for boot.
	ValidateSystemdUnitIsRunning(ctx, s, serviceName)
	ValidateSystemdUnitIsNotFailed(ctx, s, serviceName)
	execScriptOnVMForScenarioValidateExitCode(ctx, s, fmt.Sprintf("systemctl is-enabled %s", serviceName), 0, fmt.Sprintf("%s should be enabled", serviceName))

	// Validate binary exists and is executable.
	// The binary is installed at /usr/bin and symlinked to /opt/bin for consistency with other binaries (kubelet, etc.)
	ValidateFileExists(ctx, s, "/usr/bin/node-exporter")
	ValidateFileExists(ctx, s, "/opt/bin/node-exporter")
	ValidateFileExists(ctx, s, "/opt/bin/node-exporter-startup.sh")

	// Validate configuration files exist.
	ValidateFileExists(ctx, s, skipFile)
	ValidateFileExists(ctx, s, "/etc/node-exporter.d/web-config.yml")

	// node-exporter-startup.sh binds the listener to ${NODE_IP}:19100 and may
	// enable mutual TLS (RequireAndVerifyClientCert), so probing
	// http://localhost:9100/metrics would fail even on a healthy node. Verify
	// the listener is up on the real port instead.
	s.T.Logf("Validating node-exporter is listening on port 19100")
	command := []string{
		"set -ex",
		"ss -ltn | grep -q ':19100'",
	}
	execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "node-exporter should be listening on port 19100")

	s.T.Logf("node-exporter validation passed")
}

func ValidateNPDFilesystemCorruption(ctx context.Context, s *Scenario) {
command := []string{
"set -ex",
Expand Down
36 changes: 36 additions & 0 deletions parts/common/components.json
Original file line number Diff line number Diff line change
Expand Up @@ -1764,6 +1764,42 @@
}
}
}
},
{
"name": "node-exporter",
"downloadLocation": "/opt/node-exporter",
"downloadURIs": {
"ubuntu": {
"current": {
"versionsV2": [
{
"renovateTag": "name=node-exporter-kubernetes, repository=production, os=ubuntu, release=18.04",
"latestVersion": "1.9.1-ubuntu18.04u5"
}
]
}
},
"mariner": {
"current": {
"versionsV2": [
{
"renovateTag": "RPM_registry=https://packages.microsoft.com/cbl-mariner/2.0/prod/cloud-native/x86_64/repodata, name=node-exporter-kubernetes, os=mariner, release=2.0",
"latestVersion": "1.9.1-8.cm2"
}
]
}
},
"azurelinux": {
"v3.0": {
"versionsV2": [
{
"renovateTag": "RPM_registry=https://packages.microsoft.com/azurelinux/3.0/prod/cloud-native/x86_64/repodata, name=node-exporter-kubernetes, os=azurelinux, release=3.0",
"latestVersion": "1.9.1-8.azl3"
}
]
}
}
}
}
],
"OCIArtifacts": [
Expand Down
19 changes: 19 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,25 @@ EOF
systemctlEnableAndStart mig-partition 300
}

configureNodeExporter() {
    echo "Configuring Node Exporter"
    # The sentinel below exists only on VHDs where node-exporter was baked in.
    # It doubles as a signal to the AKS VM extension that agentbaker owns this
    # component and the extension should not manage it. When it is absent there
    # is nothing to configure, so succeed without doing anything.
    if [ ! -f /etc/node-exporter.d/skip_vhd_node_exporter ]; then
        echo "Node Exporter assets not found on this VHD (missing /etc/node-exporter.d/skip_vhd_node_exporter); skipping configuration."
        return 0
    fi

    # Bring up the exporter service itself.
    if ! systemctlEnableAndStart node-exporter 30; then
        echo "Failed to start node-exporter service"
        return $ERR_NODE_EXPORTER_START_FAIL
    fi
    # Bring up the path unit that restarts the exporter whenever a kubelet
    # serving certificate changes on disk.
    if ! systemctlEnableAndStart node-exporter-restart.path 30; then
        echo "Failed to start node-exporter-restart.path"
        return $ERR_NODE_EXPORTER_START_FAIL
    fi
    echo "Node Exporter started successfully"
}

ensureSysctl() {
SYSCTL_CONFIG_FILE=/etc/sysctl.d/999-sysctl-aks.conf
mkdir -p "$(dirname "${SYSCTL_CONFIG_FILE}")"
Expand Down
10 changes: 6 additions & 4 deletions parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ ERR_ENABLE_MANAGED_GPU_EXPERIENCE=123 # Error configuring managed GPU experience
# Error code 124 is returned when a `timeout` command times out, and --preserve-status is not specified: https://man7.org/linux/man-pages/man1/timeout.1.html
ERR_VHD_BUILD_ERROR=125 # Reserved for VHD CI exit conditions

# NOTE(review): 128 collides with the shell convention of 128+N meaning
# "terminated by signal N" (and 126/127 are reserved by POSIX for exec
# failures); confirm nothing upstream interprets CSE exit codes by that
# convention before shipping — TODO confirm
ERR_NODE_EXPORTER_START_FAIL=128 # Error starting or enabling node-exporter service

ERR_SWAP_CREATE_FAIL=130 # Error allocating swap file
ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE=131 # Error insufficient disk space for swap file creation

Expand Down Expand Up @@ -938,10 +940,10 @@ fallbackToKubeBinaryInstall() {
if [ "${SHOULD_ENFORCE_KUBE_PMC_INSTALL}" = "true" ]; then
echo "Kube PMC install is enforced, skipping fallback to kube binary install for ${packageName}"
return 1
elif [ -f "/opt/bin/${packageName}-${packageVersion}" ]; then
mv "/opt/bin/${packageName}-${packageVersion}" "/opt/bin/${packageName}"
chmod a+x /opt/bin/${packageName}
rm -rf /opt/bin/${packageName}-* &
elif [ -f "/usr/local/bin/${packageName}-${packageVersion}" ]; then
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why the change here? we should be using /opt/bin

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah i think a few things got messed up when i decided to involve master in some rebasing and not main... everyone is pretty upset by that move.

mv "/usr/local/bin/${packageName}-${packageVersion}" "/usr/local/bin/${packageName}"
chmod a+x /usr/local/bin/${packageName}
rm -rf /usr/local/bin/${packageName}-* &
return 0
else
echo "No binary fallback found for ${packageName} version ${packageVersion}"
Expand Down
2 changes: 2 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,8 @@ function nodePrep {

logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet

logs_to_events "AKS.CSE.configureNodeExporter" configureNodeExporter

if $REBOOTREQUIRED; then
echo 'reboot required, rebooting node in 1 minute'
/bin/bash -c "shutdown -r 1 &"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Default node-exporter TLS web configuration baked into the VHD.
# NOTE(review): node-exporter-startup.sh rewrites this file at service start
# when kubelet cert paths are detected, so the static paths below are only the
# fallback; when kubelet serving certificate rotation is enabled the effective
# paths become /var/lib/kubelet/pki/kubelet-server-current.pem.
tls_server_config:
  cert_file: "/etc/kubernetes/certs/kubeletserver.crt"
  key_file: "/etc/kubernetes/certs/kubeletserver.key"
  client_auth_type: "RequireAndVerifyClientCert"
  client_ca_file: "/etc/kubernetes/certs/ca.crt"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Path]
# Activates the unit with the matching name (node-exporter-restart.service),
# which restarts node-exporter so it picks up re-issued serving certificates.
# Watch server cert paths - one will exist depending on whether kubelet serving cert rotation is enabled
# Rotation enabled: kubelet-server-current.pem (symlink updated on rotation)
# Rotation disabled: kubeletserver.crt (static cert)
PathModified=/var/lib/kubelet/pki/kubelet-server-current.pem
PathModified=/etc/kubernetes/certs/kubeletserver.crt

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[Service]
# One-off command executed each time node-exporter-restart.path triggers.
# systemd service types are lowercase ("oneshot"); "OneShot" is rejected by
# the unit-file parser and the service would silently fall back to the
# default type, changing its start/stop semantics.
Type=oneshot
ExecStart=/bin/systemctl restart node-exporter.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[Unit]
Description=Prometheus Node Exporter
Documentation=https://github.com/prometheus/node_exporter
# NOTE(review): the startup script binds to the node's primary IP; consider
# adding Wants=/After=network-online.target to avoid racing network bring-up
# at boot — TODO confirm

[Service]
# The startup script resolves the node IP, generates the TLS web config when
# kubelet certs are present, then execs /opt/bin/node-exporter in-place.
ExecStart=/opt/bin/node-exporter-startup.sh

Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/sh
# Starts Prometheus node-exporter bound to the node's primary IP on port 19100.
# When kubelet serving certificates can be located, a TLS web config requiring
# client-certificate authentication is generated and passed to the exporter.

# Resolve the node's primary IPv4 address. Flatcar needs eth0's address read
# explicitly; other distros take the first address reported by hostname -I.
# Sourcing os-release in a subshell tolerates both ID=flatcar and
# ID="flatcar", which the previous `grep ^ID= | cut -c 4-` parse did not.
OS_ID=$(. /etc/os-release 2>/dev/null && echo "$ID")
if [ "$OS_ID" = "flatcar" ]; then
    NODE_IP=$(ip -o -4 addr show dev eth0 | awk '{print $4}' | cut -d '/' -f 1)
else
    NODE_IP=$(hostname -I | awk '{print $1}')
fi

TLS_CONFIG_PATH="/etc/node-exporter.d/web-config.yml"
TLS_CONFIG_ARG=""
KUBELET_DEFAULTS="/etc/default/kubelet"

# Detect TLS cert paths from kubelet configuration.
# Priority: rotation cert > static cert paths from kubelet flags > skip TLS.
CERT_FILE=""
KEY_FILE=""

# The rotation bundle (used when --rotate-server-certificates=true) carries
# both cert and key in a single PEM whose symlink is updated on each rotation.
if [ -f "/var/lib/kubelet/pki/kubelet-server-current.pem" ]; then
    CERT_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem"
    KEY_FILE="/var/lib/kubelet/pki/kubelet-server-current.pem"
elif [ -f "$KUBELET_DEFAULTS" ]; then
    # Parse kubelet flags for static cert paths.
    KUBELET_FLAGS=$(grep "^KUBELET_FLAGS=" "$KUBELET_DEFAULTS" | cut -d'=' -f2-)
    TLS_CERT=$(echo "$KUBELET_FLAGS" | grep -o '\--tls-cert-file=[^ ]*' | cut -d'=' -f2)
    TLS_KEY=$(echo "$KUBELET_FLAGS" | grep -o '\--tls-private-key-file=[^ ]*' | cut -d'=' -f2)

    # Only trust flag-derived paths when both files actually exist on disk.
    if [ -n "$TLS_CERT" ] && [ -n "$TLS_KEY" ] && [ -f "$TLS_CERT" ] && [ -f "$TLS_KEY" ]; then
        CERT_FILE="$TLS_CERT"
        KEY_FILE="$TLS_KEY"
    fi
fi

# Configure TLS only when valid cert paths were found; otherwise the exporter
# serves plain HTTP.
if [ -n "$CERT_FILE" ] && [ -n "$KEY_FILE" ]; then
    cat > "$TLS_CONFIG_PATH" <<EOF
tls_server_config:
  cert_file: "$CERT_FILE"
  key_file: "$KEY_FILE"
  client_auth_type: "RequireAndVerifyClientCert"
  client_ca_file: "/etc/kubernetes/certs/ca.crt"
EOF
    TLS_CONFIG_ARG="--web.config.file=${TLS_CONFIG_PATH}"
fi

# TLS_CONFIG_ARG is intentionally unquoted: when empty it must expand to no
# argument at all.
exec /opt/bin/node-exporter \
    --web.listen-address=${NODE_IP}:19100 \
    ${TLS_CONFIG_ARG} \
    --no-collector.wifi \
    --no-collector.hwmon \
    --collector.cpu.info \
    --collector.filesystem.mount-points-exclude="^/(dev|proc|sys|run/containerd/.+|var/lib/docker/.+|var/lib/kubelet/.+)($|/)" \
    --collector.netclass.ignored-devices="^(azv.*|veth.*|[a-f0-9]{15})$" \
    --collector.netclass.netlink \
    --collector.netdev.device-exclude="^(azv.*|veth.*|[a-f0-9]{15})$" \
    --no-collector.arp.netlink
Empty file.
Loading
Loading