# Manually triggered health check for the self-hosted CI runners.
#
# The job runs once per runner listed in the matrix. Every step appends a
# Markdown section to the job summary ($GITHUB_STEP_SUMMARY) and emits
# ::warning:: / ::error:: annotations for conditions that need attention.
# None of the checks exits non-zero on purpose: the annotations are the
# result, so a single unhealthy probe does not hide the remaining sections.
name: Self-Hosted Runner Health Check

on:
  workflow_dispatch:

jobs:
  health-check:
    # Only the upstream organisation owns these runners.
    if: github.repository_owner == 'bluerobotics'
    strategy:
      # Keep checking the other runners when one of them fails.
      fail-fast: false
      matrix:
        include:
          - runner: blueos-ci
            name: "BlueOS CI (ARM32)"
          - runner: pi4-builder2
            name: "Pi4 Builder 2 (ARM32)"
          - runner: pi5-builder
            name: "Pi5 Builder (ARM64)"

    runs-on: ${{ matrix.runner }}
    name: ${{ matrix.name }}

    steps:
      - name: System Information
        run: |
          {
            echo "## System Information"
            echo "| Property | Value |"
            echo "|----------|-------|"
            echo "| Hostname | $(hostname) |"
            echo "| Kernel | $(uname -r) |"
            echo "| Architecture | $(uname -m) |"
            echo "| OS | $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2) |"
            echo "| Uptime | $(uptime -p) |"
            echo "| Date | $(date) |"
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Disk Usage
        run: |
          {
            echo "## Disk Usage"
            echo '```'
            df -h | grep -E "^/dev|Filesystem"
            echo '```'
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

          # Warn when the root filesystem is more than 85% full.
          # -P keeps every filesystem on a single line even when the device
          # name is long, so the second line is always the data row.
          DISK_USAGE=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
          if [ "$DISK_USAGE" -gt 85 ]; then
            echo "::warning::Disk usage is at ${DISK_USAGE}% - consider cleaning up"
            echo "⚠️ **WARNING: Disk usage is at ${DISK_USAGE}%**" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "✅ Disk usage is at ${DISK_USAGE}%" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"

      - name: Memory Usage
        run: |
          {
            echo "## Memory Usage"
            echo '```'
            free -h
            echo '```'
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

          # used * 100 / total from the "Mem:" row, truncated to an integer.
          # The result is empty when the row is missing or total is 0, which
          # is reported instead of aborting on a division error.
          MEM_PERCENT=$(free | awk '/^Mem:/ && $2 > 0 { printf "%d", $3 * 100 / $2 }')
          if [ -z "$MEM_PERCENT" ]; then
            echo "::warning::Unable to determine memory usage"
            echo "⚠️ Unable to determine memory usage" >> "$GITHUB_STEP_SUMMARY"
          elif [ "$MEM_PERCENT" -gt 85 ]; then
            echo "::warning::Memory usage is at ${MEM_PERCENT}%"
            echo "⚠️ **WARNING: Memory usage is at ${MEM_PERCENT}%**" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "✅ Memory usage is at ${MEM_PERCENT}%" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"

      - name: CPU Information
        run: |
          {
            echo "## CPU Information"
            echo "| Property | Value |"
            echo "|----------|-------|"
            echo "| CPU Model | $(grep 'model name\|Model' /proc/cpuinfo | head -1 | cut -d':' -f2 | xargs) |"
            echo "| CPU Cores | $(nproc) |"
            echo "| Load Average | $(cut -d' ' -f1-3 /proc/loadavg) |"
          } >> "$GITHUB_STEP_SUMMARY"

          # The thermal zone reports millidegrees Celsius; it is skipped on
          # hosts that do not expose it.
          if [ -f /sys/class/thermal/thermal_zone0/temp ]; then
            TEMP=$(cat /sys/class/thermal/thermal_zone0/temp)
            TEMP_C=$((TEMP / 1000))
            echo "| CPU Temperature | ${TEMP_C}°C |" >> "$GITHUB_STEP_SUMMARY"
            if [ "$TEMP_C" -gt 70 ]; then
              echo "::warning::CPU temperature is ${TEMP_C}°C - consider improving cooling"
            fi
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"

      - name: Docker Status
        run: |
          echo "## Docker Status" >> "$GITHUB_STEP_SUMMARY"

          # Check if Docker is running
          if systemctl is-active --quiet docker; then
            echo "✅ Docker service is running" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "❌ **Docker service is NOT running**" >> "$GITHUB_STEP_SUMMARY"
            echo "::error::Docker service is not running"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"

          # Everything in this group, including the docker command output and
          # the fallback messages, lands inside the summary's code fences.
          {
            echo "### Docker Version"
            echo '```'
            docker version --format '{{.Server.Version}}' 2>/dev/null || echo "Unable to get Docker version"
            echo '```'
            echo ""

            echo "### Docker Disk Usage"
            echo '```'
            docker system df 2>/dev/null || echo "Unable to get Docker disk usage"
            echo '```'
            echo ""

            echo "### Running Containers"
            CONTAINERS=$(docker ps -q 2>/dev/null | wc -l)
            echo "Running containers: $CONTAINERS"
            if [ "$CONTAINERS" -gt 0 ]; then
              echo '```'
              docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Size}}" 2>/dev/null
              echo '```'
            fi
            echo ""

            echo "### Docker Images"
            IMAGES=$(docker images -q 2>/dev/null | wc -l)
            echo "Total images: $IMAGES"
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

      - name: QEMU/binfmt Status
        run: |
          echo "## QEMU/binfmt Status" >> "$GITHUB_STEP_SUMMARY"

          # Check binfmt_misc mount
          if mount | grep -q binfmt_misc; then
            echo "✅ binfmt_misc is mounted" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "❌ **binfmt_misc is NOT mounted**" >> "$GITHUB_STEP_SUMMARY"
            echo "::warning::binfmt_misc is not mounted - cross-architecture builds may fail"
          fi

          # List registered binfmt handlers
          {
            echo ""
            echo "### Registered Handlers"
            if [ -d /proc/sys/fs/binfmt_misc ]; then
              echo '```'
              ls -la /proc/sys/fs/binfmt_misc/ 2>/dev/null | grep -v "^total" | head -20
              echo '```'

              # Check for qemu-arm specifically
              if [ -f /proc/sys/fs/binfmt_misc/qemu-arm ]; then
                echo "✅ qemu-arm handler is registered"
              else
                echo "⚠️ qemu-arm handler is NOT registered"
              fi
            else
              echo "binfmt_misc directory not found"
            fi
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Loop Devices Status
        run: |
          echo "## Loop Devices" >> "$GITHUB_STEP_SUMMARY"
          # Count attached loop devices, ignoring the header row.
          LOOP_COUNT=$(losetup -l 2>/dev/null | grep -v "^NAME" | wc -l)
          echo "Active loop devices: $LOOP_COUNT" >> "$GITHUB_STEP_SUMMARY"
          if [ "$LOOP_COUNT" -gt 0 ]; then
            echo "::warning::Found $LOOP_COUNT active loop devices - may indicate incomplete cleanup"
            {
              echo '```'
              losetup -l 2>/dev/null
              echo '```'
            } >> "$GITHUB_STEP_SUMMARY"
          else
            echo "✅ No stale loop devices" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"

      - name: Runner Work Directory
        run: |
          # Parent of the checkout directory.
          WORK_DIR="${GITHUB_WORKSPACE}/.."
          {
            echo "## Runner Work Directory"
            echo "Work directory: $WORK_DIR"
            echo ""

            echo "### Directory Contents"
            echo '```'
            ls -la "$WORK_DIR" 2>/dev/null | head -20
            echo '```'
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

          # Flag leftover directories inside the BlueOS checkout.
          # NOTE(review): presumably left behind by earlier builds - confirm.
          for dir in unfor19-awscli .cache deploy; do
            if [ -d "$WORK_DIR/BlueOS/$dir" ]; then
              echo "⚠️ Found leftover directory: $dir" >> "$GITHUB_STEP_SUMMARY"
              echo "::warning::Found leftover directory: $dir"
            fi
          done

      - name: Network Connectivity
        run: |
          {
            echo "## Network Connectivity"
            echo "| Service | Status |"
            echo "|---------|--------|"
          } >> "$GITHUB_STEP_SUMMARY"

          # "Display name|host" pairs, probed in this order.
          ENDPOINTS=(
            "GitHub|github.com"
            "Docker Hub|hub.docker.com"
            "AWS S3|s3.us-east-2.amazonaws.com"
            "Raspberry Pi Downloads|downloads.raspberrypi.org"
          )

          # One ICMP echo with a 5 second timeout per host. This only proves
          # ICMP reachability; a host that drops ping is reported unreachable.
          for entry in "${ENDPOINTS[@]}"; do
            name="${entry%%|*}"
            host="${entry#*|}"
            if ping -c 1 -W 5 "$host" > /dev/null 2>&1; then
              echo "| $name | ✅ Reachable |" >> "$GITHUB_STEP_SUMMARY"
            else
              echo "| $name | ❌ Unreachable |" >> "$GITHUB_STEP_SUMMARY"
              echo "::warning::Cannot reach $name ($host)"
            fi
          done
          echo "" >> "$GITHUB_STEP_SUMMARY"

      - name: Python Environment
        run: |
          # A trailing `|| echo` after `pip3 ... | cut` would never fire,
          # because the pipeline's status is cut's. Capture the value and
          # substitute the fallback when it is empty instead.
          PIP_VERSION=$(pip3 --version 2>/dev/null | cut -d' ' -f2)

          {
            echo "## Python Environment"
            echo "| Property | Value |"
            echo "|----------|-------|"
            echo "| Python Version | $(python3 --version 2>/dev/null || echo 'Not found') |"
            echo "| Pip Version | ${PIP_VERSION:-Not found} |"
            echo "| Pip Location | $(command -v pip3 2>/dev/null || echo 'Not found') |"

            # Check if awscli is installed
            if command -v aws > /dev/null 2>&1; then
              # Some AWS CLI builds print the version banner on stderr,
              # so both streams are captured.
              echo "| AWS CLI | $(aws --version 2>&1 | cut -d' ' -f1) |"
            else
              echo "| AWS CLI | Not installed |"
            fi
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Recent Errors in System Log
        run: |
          {
            echo "## Recent System Errors (last 50 lines)"
            echo '```'
            # -n makes sudo fail immediately instead of waiting for a password,
            # so the fallback message is printed on runners without
            # passwordless sudo.
            sudo -n journalctl -p err -n 50 --no-pager 2>/dev/null || echo "Unable to read system journal"
            echo '```'
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Test Docker Pull
        run: |
          echo "## Docker Pull Test" >> "$GITHUB_STEP_SUMMARY"

          # Try to pull a small test image, then remove it again.
          if docker pull hello-world > /dev/null 2>&1; then
            echo "✅ Docker pull works correctly" >> "$GITHUB_STEP_SUMMARY"
            docker rmi hello-world > /dev/null 2>&1 || true
          else
            echo "❌ **Docker pull failed**" >> "$GITHUB_STEP_SUMMARY"
            echo "::error::Docker pull test failed"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"

      - name: Test QEMU Emulation
        run: |
          echo "## QEMU Emulation Test" >> "$GITHUB_STEP_SUMMARY"

          # The same ARM32 image is run on every runner; only the description
          # differs between ARM64 and ARM32 hosts.
          TEST_PLATFORM="linux/arm/v7"
          if [ "$(uname -m)" = "aarch64" ]; then
            TEST_DESC="ARM32 on ARM64"
          else
            TEST_DESC="ARM32 native"
          fi

          echo "Testing: $TEST_DESC" >> "$GITHUB_STEP_SUMMARY"

          if docker run --rm --platform "$TEST_PLATFORM" alpine:latest echo "QEMU test passed" > /dev/null 2>&1; then
            echo "✅ Platform emulation works ($TEST_PLATFORM)" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "⚠️ Platform emulation may have issues ($TEST_PLATFORM)" >> "$GITHUB_STEP_SUMMARY"
            echo "::warning::QEMU emulation test failed for $TEST_PLATFORM"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"

      - name: Health Summary
        # Matrix values reach the script through the environment rather than
        # by expanding expressions inside the shell source.
        env:
          MATRIX_NAME: ${{ matrix.name }}
          MATRIX_RUNNER: ${{ matrix.runner }}
        run: |
          {
            echo "## Health Check Complete"
            echo ""
            echo "Runner: **${MATRIX_NAME}** (${MATRIX_RUNNER})"
            echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
          } >> "$GITHUB_STEP_SUMMARY"