Skip to content

Commit 96f51ad

Browse files
ci: runner-health-check: First commit
Signed-off-by: Patrick José Pereira <patrickelectric@gmail.com>
1 parent e4d36f3 commit 96f51ad

File tree

1 file changed

+299
-0
lines changed

1 file changed

+299
-0
lines changed
Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
# Health check for the self-hosted runners listed in the matrix below.
# One job is created per matrix entry and runs on the runner it names.
name: Self-Hosted Runner Health Check

on:
  # The only trigger is a manual dispatch.
  workflow_dispatch:

jobs:
  health-check:
    # Run only when the repository owner is bluerobotics.
    if: github.repository_owner == 'bluerobotics'
    name: ${{ matrix.name }}
    runs-on: ${{ matrix.runner }}
    strategy:
      # Let the other matrix jobs continue when one of them fails.
      fail-fast: false
      matrix:
        include:
          - name: 'BlueOS CI (ARM32)'
            runner: blueos-ci
          - name: 'Pi4 Builder 2 (ARM32)'
            runner: pi4-builder2
          - name: 'Pi5 Builder (ARM64)'
            runner: pi5-builder

    steps:
24+
# Appends a two-column table of basic host facts to the job summary.
- name: System Information
  run: |
    # Build the whole table in one group and append it with a single redirect.
    {
      echo "## System Information"
      echo "| Property | Value |"
      echo "|----------|-------|"
      echo "| Hostname | $(hostname) |"
      echo "| Kernel | $(uname -r) |"
      echo "| Architecture | $(uname -m) |"
      # PRETTY_NAME="..." -> keep only the text between the double quotes.
      echo "| OS | $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2) |"
      echo "| Uptime | $(uptime -p) |"
      echo "| Date | $(date) |"
      echo ""
    } >> "$GITHUB_STEP_SUMMARY"
36+
37+
# Reports df output for /dev-backed filesystems and flags a root filesystem
# that is more than 85% full.
- name: Disk Usage
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"

    # Keep df's header row plus every filesystem whose source starts with /dev.
    {
      echo "## Disk Usage"
      echo "\`\`\`"
      df -h | grep -E "^/dev|Filesystem"
      echo "\`\`\`"
      echo ""
    } >> "$SUMMARY"

    # Fifth column of the last df line for "/", with the percent sign removed.
    ROOT_PCT=$(df / | tail -1 | awk '{print $5}')
    ROOT_PCT=${ROOT_PCT%\%}

    # Anything above 85% is reported as a warning annotation as well.
    if [ "$ROOT_PCT" -gt 85 ]; then
      echo "::warning::Disk usage is at ${ROOT_PCT}% - consider cleaning up"
      echo "⚠️ **WARNING: Disk usage is at ${ROOT_PCT}%**" >> "$SUMMARY"
    else
      echo "✅ Disk usage is at ${ROOT_PCT}%" >> "$SUMMARY"
    fi
    echo "" >> "$SUMMARY"
54+
55+
# Reports `free -h` and flags memory usage above 85%.
- name: Memory Usage
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"

    {
      echo "## Memory Usage"
      echo "\`\`\`"
      free -h
      echo "\`\`\`"
      echo ""
    } >> "$SUMMARY"

    # Columns 2 and 3 of the "Mem" row of `free` (total and used).
    TOTAL=$(free | awk '/Mem/ {print $2}')
    USED=$(free | awk '/Mem/ {print $3}')

    # Integer percentage; the shell's division truncates.
    USED_PCT=$((USED * 100 / TOTAL))

    if [ "$USED_PCT" -gt 85 ]; then
      echo "::warning::Memory usage is at ${USED_PCT}%"
      echo "⚠️ **WARNING: Memory usage is at ${USED_PCT}%**" >> "$SUMMARY"
    else
      echo "✅ Memory usage is at ${USED_PCT}%" >> "$SUMMARY"
    fi
    echo "" >> "$SUMMARY"
74+
75+
# Reports CPU model, core count, load average and, when the sysfs file exists,
# the temperature read from thermal_zone0.
- name: CPU Information
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"
    THERMAL_FILE=/sys/class/thermal/thermal_zone0/temp

    {
      echo "## CPU Information"
      echo "| Property | Value |"
      echo "|----------|-------|"
      # First "model name" or "Model" line; text after the colon, trimmed by xargs.
      echo "| CPU Model | $(grep 'model name\|Model' /proc/cpuinfo | head -1 | cut -d':' -f2 | xargs) |"
      echo "| CPU Cores | $(nproc) |"
      # First three space-separated fields of /proc/loadavg.
      echo "| Load Average | $(cut -d' ' -f1-3 /proc/loadavg) |"
    } >> "$SUMMARY"

    # The temperature row is optional: only emitted when the file is present.
    if [ -f "$THERMAL_FILE" ]; then
      # The raw value is divided by 1000 to get the figure shown as °C.
      TEMP_C=$(( $(cat "$THERMAL_FILE") / 1000 ))
      echo "| CPU Temperature | ${TEMP_C}°C |" >> "$SUMMARY"
      if [ "$TEMP_C" -gt 70 ]; then
        echo "::warning::CPU temperature is ${TEMP_C}°C - consider improving cooling"
      fi
    fi
    echo "" >> "$SUMMARY"
94+
95+
# Reports whether the Docker service is active, its server version, its disk
# usage, and the running containers / image count.
#
# Every docker command whose output belongs inside a fenced block is redirected
# into the job summary, together with its fallback message. Without the
# redirect the output only reaches the step log and the fences render empty.
- name: Docker Status
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"

    echo "## Docker Status" >> "$SUMMARY"

    # Service state as reported by systemd; an inactive service is annotated
    # as an error but does not stop the remaining checks.
    if systemctl is-active --quiet docker; then
      echo "✅ Docker service is running" >> "$SUMMARY"
    else
      echo "❌ **Docker service is NOT running**" >> "$SUMMARY"
      echo "::error::Docker service is not running"
    fi
    echo "" >> "$SUMMARY"

    {
      echo "### Docker Version"
      echo "\`\`\`"
      docker version --format '{{.Server.Version}}' 2>/dev/null \
        || echo "Unable to get Docker version"
      echo "\`\`\`"
      echo ""
    } >> "$SUMMARY"

    {
      echo "### Docker Disk Usage"
      echo "\`\`\`"
      docker system df 2>/dev/null || echo "Unable to get Docker disk usage"
      echo "\`\`\`"
      echo ""
    } >> "$SUMMARY"

    echo "### Running Containers" >> "$SUMMARY"
    # One container ID per line, so the line count is the container count.
    CONTAINERS=$(docker ps -q 2>/dev/null | wc -l)
    echo "Running containers: $CONTAINERS" >> "$SUMMARY"
    if [ "$CONTAINERS" -gt 0 ]; then
      {
        echo "\`\`\`"
        # The fallback keeps the closing fence from being skipped on failure.
        docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Size}}" 2>/dev/null \
          || echo "Unable to list running containers"
        echo "\`\`\`"
      } >> "$SUMMARY"
    fi
    echo "" >> "$SUMMARY"

    echo "### Docker Images" >> "$SUMMARY"
    IMAGES=$(docker images -q 2>/dev/null | wc -l)
    echo "Total images: $IMAGES" >> "$SUMMARY"
    echo "" >> "$SUMMARY"
134+
135+
# Reports whether binfmt_misc is mounted and which handlers are registered,
# with a dedicated check for the qemu-arm handler.
#
# The handler listing is redirected into the job summary so the fenced block
# actually contains it; previously it went to the step log only.
- name: QEMU/binfmt Status
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"
    BINFMT_DIR=/proc/sys/fs/binfmt_misc

    echo "## QEMU/binfmt Status" >> "$SUMMARY"

    # Check binfmt_misc mount
    if mount | grep -q binfmt_misc; then
      echo "✅ binfmt_misc is mounted" >> "$SUMMARY"
    else
      echo "❌ **binfmt_misc is NOT mounted**" >> "$SUMMARY"
      echo "::warning::binfmt_misc is not mounted - cross-architecture builds may fail"
    fi

    # List registered binfmt handlers
    echo "" >> "$SUMMARY"
    echo "### Registered Handlers" >> "$SUMMARY"
    if [ -d "$BINFMT_DIR" ]; then
      {
        echo "\`\`\`"
        # Drop ls's "total" line and cap the listing at 20 entries.
        ls -la "$BINFMT_DIR/" 2>/dev/null | grep -v "^total" | head -20
        echo "\`\`\`"
      } >> "$SUMMARY"

      # Check for qemu-arm specifically
      if [ -f "$BINFMT_DIR/qemu-arm" ]; then
        echo "✅ qemu-arm handler is registered" >> "$SUMMARY"
      else
        echo "⚠️ qemu-arm handler is NOT registered" >> "$SUMMARY"
      fi
    else
      echo "binfmt_misc directory not found" >> "$SUMMARY"
    fi
    echo "" >> "$SUMMARY"
165+
166+
# Counts active loop devices and, when any exist, lists them and raises a
# warning annotation.
#
# The `losetup -l` listing is redirected into the job summary so the fenced
# block is not empty; previously it went to the step log only.
- name: Loop Devices Status
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"

    echo "## Loop Devices" >> "$SUMMARY"

    # Every line of the listing except the NAME header row is one device.
    # `wc -l` is kept (rather than `grep -c`) because grep exits non-zero on
    # zero matches, which would abort the step under `bash -e`.
    LOOP_COUNT=$(losetup -l 2>/dev/null | grep -v "^NAME" | wc -l)
    echo "Active loop devices: $LOOP_COUNT" >> "$SUMMARY"

    if [ "$LOOP_COUNT" -gt 0 ]; then
      echo "::warning::Found $LOOP_COUNT active loop devices - may indicate incomplete cleanup"
      {
        echo "\`\`\`"
        losetup -l 2>/dev/null
        echo "\`\`\`"
      } >> "$SUMMARY"
    else
      echo "✅ No stale loop devices" >> "$SUMMARY"
    fi
    echo "" >> "$SUMMARY"
180+
181+
# Lists the parent of the workspace directory and warns about specific
# leftover directories under its BlueOS subdirectory.
#
# The directory listing is redirected into the job summary so the fenced
# block is not empty; previously it went to the step log only.
- name: Runner Work Directory
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"

    echo "## Runner Work Directory" >> "$SUMMARY"

    # Check for leftover directories that could cause issues
    WORK_DIR="${GITHUB_WORKSPACE}/.."
    echo "Work directory: $WORK_DIR" >> "$SUMMARY"
    echo "" >> "$SUMMARY"

    {
      echo "### Directory Contents"
      echo "\`\`\`"
      # Cap the listing at 20 lines.
      ls -la "$WORK_DIR" 2>/dev/null | head -20
      echo "\`\`\`"
      echo ""
    } >> "$SUMMARY"

    # Check for problematic directories
    for dir in unfor19-awscli .cache deploy; do
      if [ -d "$WORK_DIR/BlueOS/$dir" ]; then
        echo "⚠️ Found leftover directory: $dir" >> "$SUMMARY"
        echo "::warning::Found leftover directory: $dir"
      fi
    done
203+
204+
# Pings a fixed list of hosts once each and records reachability in a table.
- name: Network Connectivity
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"

    # "label=host" pairs, probed in the order listed.
    TARGETS=(
      "GitHub=github.com"
      "Docker Hub=hub.docker.com"
      "AWS S3=s3.us-east-2.amazonaws.com"
      "Raspberry Pi Downloads=downloads.raspberrypi.org"
    )

    {
      echo "## Network Connectivity"
      echo "| Service | Status |"
      echo "|---------|--------|"
    } >> "$SUMMARY"

    for target in "${TARGETS[@]}"; do
      # Split at the first "=": label on the left, host on the right.
      name=${target%%=*}
      host=${target#*=}

      # A single echo request with a 5 second wait decides the result.
      if ping -c 1 -W 5 "$host" > /dev/null 2>&1; then
        echo "| $name | ✅ Reachable |" >> "$SUMMARY"
      else
        echo "| $name | ❌ Unreachable |" >> "$SUMMARY"
        echo "::warning::Cannot reach $name ($host)"
      fi
    done
    echo "" >> "$SUMMARY"
229+
230+
# Reports the python3 / pip3 versions, the pip3 location, and whether the
# AWS CLI is installed.
- name: Python Environment
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"

    # A pipeline's exit status is that of its last command (unless pipefail is
    # set), so a `|| echo 'Not found'` placed after `... | cut ...` never runs.
    # Capture the value and substitute the fallback when it is empty instead.
    PIP_VERSION=$(pip3 --version 2>/dev/null | cut -d' ' -f2)

    {
      echo "## Python Environment"
      echo "| Property | Value |"
      echo "|----------|-------|"
      echo "| Python Version | $(python3 --version 2>/dev/null || echo 'Not found') |"
      echo "| Pip Version | ${PIP_VERSION:-Not found} |"
      echo "| Pip Location | $(which pip3 2>/dev/null || echo 'Not found') |"
    } >> "$SUMMARY"

    # Check if awscli is installed
    if command -v aws &> /dev/null; then
      # First space-separated field of the version banner.
      echo "| AWS CLI | $(aws --version 2>/dev/null | cut -d' ' -f1) |" >> "$SUMMARY"
    else
      echo "| AWS CLI | Not installed |" >> "$SUMMARY"
    fi
    echo "" >> "$SUMMARY"
246+
247+
# Appends the most recent error-priority journal entries to the job summary.
#
# Three fixes over the previous version:
#  - the journal output is written into the summary's fenced block instead of
#    only reaching the step log;
#  - the extra `tail -30` is gone, so the output matches the "last 50 lines"
#    heading requested with `-n 50`;
#  - the fallback message is reachable: it was attached to a pipeline whose
#    status came from `tail`, so it never ran when journalctl failed.
- name: Recent Errors in System Log
  run: |
    {
      echo "## Recent System Errors (last 50 lines)"
      echo "\`\`\`"
      # Test journalctl's own exit status by capturing its output first.
      if JOURNAL=$(sudo journalctl -p err -n 50 --no-pager 2>/dev/null); then
        echo "$JOURNAL"
      else
        echo "Unable to read system journal"
      fi
      echo "\`\`\`"
      echo ""
    } >> "$GITHUB_STEP_SUMMARY"
254+
255+
# Verifies that the runner can pull an image from the registry.
- name: Test Docker Pull
  run: |
    # Append one line to the job summary.
    summary() { printf '%s\n' "$1" >> "$GITHUB_STEP_SUMMARY"; }

    summary "## Docker Pull Test"

    # Pull a small test image; on success remove it again, ignoring any
    # failure of the removal itself.
    if ! docker pull hello-world > /dev/null 2>&1; then
      summary "❌ **Docker pull failed**"
      echo "::error::Docker pull test failed"
    else
      summary "✅ Docker pull works correctly"
      docker rmi hello-world > /dev/null 2>&1 || true
    fi
    summary ""
268+
269+
# Runs a trivial command in a linux/arm/v7 Alpine container and reports
# whether it succeeded.
- name: Test QEMU Emulation
  run: |
    SUMMARY="$GITHUB_STEP_SUMMARY"

    echo "## QEMU Emulation Test" >> "$SUMMARY"

    # The container platform is the same on every runner; only the
    # description shown in the summary depends on the host architecture.
    TEST_PLATFORM="linux/arm/v7"
    case "$(uname -m)" in
      aarch64) TEST_DESC="ARM32 on ARM64" ;;
      *)       TEST_DESC="ARM32 native" ;;
    esac

    echo "Testing: $TEST_DESC" >> "$SUMMARY"

    # Only the exit status matters; the container's output is discarded.
    if docker run --rm --platform "$TEST_PLATFORM" alpine:latest echo "QEMU test passed" > /dev/null 2>&1; then
      echo "✅ Platform emulation works ($TEST_PLATFORM)" >> "$SUMMARY"
    else
      echo "⚠️ Platform emulation may have issues ($TEST_PLATFORM)" >> "$SUMMARY"
      echo "::warning::QEMU emulation test failed for $TEST_PLATFORM"
    fi
    echo "" >> "$SUMMARY"
293+
294+
# Closes the report with the matrix entry that was checked and a UTC timestamp.
- name: Health Summary
  run: |
    {
      echo "## Health Check Complete"
      echo ""
      # matrix.name / matrix.runner are expanded by Actions before the shell runs.
      echo "Runner: **${{ matrix.name }}** (${{ matrix.runner }})"
      echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
    } >> "$GITHUB_STEP_SUMMARY"

0 commit comments

Comments
 (0)