Skip to content

Fix corner-fix Dirichlet BC indexing for Ng>1 #567

Fix corner-fix Dirichlet BC indexing for Ng>1

Fix corner-fix Dirichlet BC indexing for Ng>1 #567

Workflow file for this run

# GPU CI: builds and tests on a self-hosted runner by submitting Slurm jobs
# (correctness suite, then performance suite), with the HYPRE dependency build
# cached between runs via actions/cache restore/save.
name: GPU CI

on:
  push:

jobs:
  gpu-tests:
    runs-on: self-hosted
    timeout-minutes: 150
    # Serialize per branch (keeps parallelism across branches)
    concurrency:
      group: gpu-ci-${{ github.workflow }}-${{ github.ref }}
      cancel-in-progress: false
    steps:
      - uses: actions/checkout@v4

      # Extract HYPRE version from CMakeLists.txt to ensure cache key stays in sync
      # Uses python for portability (grep -P isn't guaranteed on all distros)
      - name: Extract HYPRE version
        id: hypre_version
        run: |
          # No `shell:` is set, so the run step uses `bash -e {0}` (no pipefail).
          # Without pipefail the pipeline's status is tee's, which would hide a
          # failed extraction and yield an empty version in the cache key.
          set -o pipefail
          python3 - <<'PY' | tee -a "$GITHUB_OUTPUT"
          import pathlib
          import re
          import sys

          text = pathlib.Path("CMakeLists.txt").read_text()
          m = re.search(r"GIT_TAG\s+v(\d+(?:\.\d+)*)", text)
          if not m:
              raise SystemExit("ERROR: Could not extract HYPRE version from CMakeLists.txt")
          # stdout is appended to $GITHUB_OUTPUT by tee; stderr only reaches the log
          print(f"version={m.group(1)}")
          print(f"Detected HYPRE version: {m.group(1)}", file=sys.stderr)
          PY

      # Ensure cache directories exist (avoids warnings on cold start)
      - name: Ensure cache directories exist
        run: mkdir -p build_cpu_hypre/_deps build_gpu_hypre/_deps

      # Restore HYPRE cache (slow to compile, rarely changes)
      # Key includes HYPRE version and hash of build config files
      # Only cache *_hypre directories - ci.sh uses these when HYPRE is enabled
      - name: Restore HYPRE cache
        id: hypre_cache
        uses: actions/cache/restore@v4
        with:
          path: |
            build_cpu_hypre/_deps
            build_gpu_hypre/_deps
          key: hypre-v${{ steps.hypre_version.outputs.version }}-h200-cc90-${{ hashFiles('CMakeLists.txt', 'cmake/**', '.github/scripts/**') }}
          restore-keys: |
            hypre-v${{ steps.hypre_version.outputs.version }}-h200-cc90-

      # Clean hypre-subbuild after cache restore - these contain CMakeCache.txt with
      # absolute paths that break when restored on a different runner
      - name: Clean HYPRE subbuild cache (path-sensitive)
        run: |
          echo "Cleaning hypre-subbuild directories (contain absolute paths)..."
          for dir in build_cpu_hypre/_deps build_gpu_hypre/_deps; do
            if [ -d "${dir}/hypre-subbuild" ]; then
              echo " Removing ${dir}/hypre-subbuild"
              rm -rf "${dir}/hypre-subbuild"
            fi
          done
          echo "Done - hypre-build (compiled libs) preserved, hypre-subbuild (CMake config) cleared"

      - name: Submit GPU correctness suite to Slurm (H200, 1 GPU)
        run: |
          ./.github/scripts/submit_and_monitor_slurm.sh \
            ./.github/scripts/gpu_ci_correctness.sbatch.template \
            "${GITHUB_WORKSPACE}" \
            "${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}.out" \
            "${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}.err" \
            "${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}.sbatch"

      - name: Show Build Output
        if: always()
        run: |
          echo "==================================================================="
          echo " GPU CI Build Log (last 300 lines)"
          echo "==================================================================="
          echo ""
          if [ -f "${GITHUB_WORKSPACE}/gpu_ci_build.log" ]; then
            tail -n 300 "${GITHUB_WORKSPACE}/gpu_ci_build.log"
          else
            echo "Build log not found (build may have failed early)"
            echo "Check the Slurm output for details."
          fi

      - name: Show Test Output
        if: always()
        run: |
          echo "==================================================================="
          echo " GPU CI Test Log (last 300 lines)"
          echo "==================================================================="
          echo ""
          if [ -f "${GITHUB_WORKSPACE}/gpu_ci_test.log" ]; then
            tail -n 300 "${GITHUB_WORKSPACE}/gpu_ci_test.log"
          else
            echo "Test log not found (tests may not have run)"
            echo "Check the build log and Slurm output for details."
          fi

      # No `if:` here, so this step is skipped once any earlier step has failed
      - name: Submit GPU performance suite to Slurm (H200, 1 GPU)
        run: |
          ./.github/scripts/submit_and_monitor_slurm.sh \
            ./.github/scripts/gpu_ci_perf.sbatch.template \
            "${GITHUB_WORKSPACE}" \
            "${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}.out" \
            "${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}.err" \
            "${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}.sbatch"

      - name: Show Slurm Output (on failure)
        if: failure()
        run: |
          echo "==================================================================="
          echo " Slurm Job Output (last 200 lines each)"
          echo "==================================================================="
          for f in "${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}.out" \
                   "${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}.err" \
                   "${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}.out" \
                   "${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}.err"; do
            if [ -f "$f" ]; then
              echo ""
              echo "===== $f ====="
              tail -n 200 "$f"
            fi
          done

      - name: Debug info on failure
        if: failure()
        run: |
          echo "=== Debug Information ==="
          echo "Commit: $(git rev-parse HEAD)"
          echo "Branch: $(git rev-parse --abbrev-ref HEAD)"
          echo ""
          echo "=== Build directories ==="
          ls -la build_gpu_hypre 2>/dev/null || echo "No build_gpu_hypre directory"
          ls -la build_cpu_hypre 2>/dev/null || echo "No build_cpu_hypre directory"
          echo ""
          echo "=== HYPRE cache status ==="
          ls -la build_gpu_hypre/_deps 2>/dev/null || echo "No GPU HYPRE cache"
          ls -la build_cpu_hypre/_deps 2>/dev/null || echo "No CPU HYPRE cache"
          echo ""
          echo "=== Test binaries ==="
          ls -lh build_gpu_hypre/test_* 2>/dev/null || echo "No test binaries found"

      # Save HYPRE cache if build succeeded (even if tests failed)
      # HYPRE cache is independent of test results - only corrupted if build was interrupted
      # Check for HYPRE library artifacts to verify build completed successfully
      # NOTE: this step only decides whether to save (output `save_cache`); the
      # upload itself happens in the "Actually save HYPRE cache" step below.
      - name: Save HYPRE cache
        id: check_hypre
        continue-on-error: true
        if: always() && steps.hypre_cache.outputs.cache-hit != 'true'
        run: |
          # Only save if HYPRE build artifacts exist (proof that build completed)
          # Require at least one libHYPRE to be found (prevents empty cache on cold start)
          HYPRE_OK=true
          FOUND_LIB=false
          for dir in build_cpu_hypre build_gpu_hypre; do
            if [ -d "${dir}/_deps/hypre-build" ]; then
              # Check for libHYPRE (the key artifact).
              # compgen -G succeeds only if the glob matches at least one path;
              # piping ls into head would report head's status and always pass.
              if compgen -G "${dir}/_deps/hypre-build/src/libHYPRE*" > /dev/null; then
                FOUND_LIB=true
              else
                echo "WARNING: ${dir} HYPRE build incomplete (no libHYPRE found)"
                HYPRE_OK=false
              fi
            fi
          done
          # Must find at least one lib AND no incomplete builds
          if [ "$HYPRE_OK" = "true" ] && [ "$FOUND_LIB" = "true" ]; then
            echo "HYPRE builds complete - proceeding with cache save"
            echo "save_cache=true" >> "$GITHUB_OUTPUT"
          else
            echo "HYPRE builds incomplete or missing - skipping cache save"
            echo "save_cache=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Actually save HYPRE cache
        continue-on-error: true
        if: always() && steps.hypre_cache.outputs.cache-hit != 'true' && steps.check_hypre.outputs.save_cache == 'true'
        uses: actions/cache/save@v4
        with:
          path: |
            build_cpu_hypre/_deps
            build_gpu_hypre/_deps
          # Reuse the exact key computed by the restore step so the saved entry
          # cannot drift from the key that was looked up (hashFiles would be
          # re-evaluated here, after the build has touched the workspace).
          key: ${{ steps.hypre_cache.outputs.cache-primary-key }}

      - name: Cleanup
        if: always()
        run: |
          # On failure, preserve logs for debugging; on success, clean everything
          if [ "${{ job.status }}" = "success" ]; then
            # Clean up Slurm artifacts and logs only on success
            rm -f "${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}.sbatch" || true
            rm -f "${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}.out" || true
            rm -f "${GITHUB_WORKSPACE}/gpu_ci_correctness_${GITHUB_RUN_ID}.err" || true
            rm -f "${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}.sbatch" || true
            rm -f "${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}.out" || true
            rm -f "${GITHUB_WORKSPACE}/gpu_ci_perf_${GITHUB_RUN_ID}.err" || true
            rm -f "${GITHUB_WORKSPACE}/gpu_ci_build.log" || true
            rm -f "${GITHUB_WORKSPACE}/gpu_ci_test.log" || true
          else
            echo "=== Preserving logs for debugging (job status: ${{ job.status }}) ==="
            ls -la "${GITHUB_WORKSPACE}"/gpu_ci_*.{sbatch,out,err,log} 2>/dev/null || true
          fi
          # Clean build artifacts but keep _deps for cache (only *_hypre dirs are cached)
          for dir in build_gpu_hypre build_cpu_hypre; do
            if [ -d "${GITHUB_WORKSPACE}/${dir}" ]; then
              find "${GITHUB_WORKSPACE}/${dir}" -mindepth 1 -maxdepth 1 ! -name '_deps' -exec rm -rf {} +
            fi
          done
          # Fully remove non-hypre build dirs (not cached)
          rm -rf "${GITHUB_WORKSPACE}/build_gpu" "${GITHUB_WORKSPACE}/build_cpu" || true