Fix GPU ||b||_2 computation: sync and NVHPC workaround

sbryngelson · claude · sbryngelson · commit 28fbbf18a51e · 2026-01-27T18:08:07.000-05:00
Three fixes for the adaptive MG convergence check on GPU:

1. Add cudaDeviceSynchronize before ||b||_2 computation
   The data copy from rhs_present to f_level0_ptr_ may not be complete
   without explicit sync, causing the reduction to read stale/garbage data.

2. Use f_level0_ptr_ instead of f_ptrs_[0] for omp_get_mapped_ptr
   Vector element access (f_ptrs_[0]) can return stale addresses in NVHPC
   target regions. Member pointer f_level0_ptr_ is set once and stable.

3. Add sanity check for garbage b_l2_ values
   If the reduction returns NaN/Inf or a suspiciously small value (below
   1e-30), set b_l2_=0 so the convergence check uses the raw residual instead
   of the relative one. This prevents early exit on bad reduction results.

These fixes address the GPU CI test failure where GalileanStageBreakdownTest
was failing with 775x divergence ratios instead of expected <3x.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/src/poisson_solver_multigrid.cpp b/src/poisson_solver_multigrid.cpp
@@ -1871,6 +1871,8 @@ int MultigridPoissonSolver::solve_device(double* rhs_present, double* p_present,
 
             // Compute ||b||_2 BEFORE running any V-cycles (RHS is still pristine)
             // This matches the convergence mode pattern
+            // CRITICAL: Sync to ensure data copy is complete before reading
+            CUDA_CHECK_SYNC(cudaDeviceSynchronize());
             {
                 auto& finest = *levels_[0];
                 const int Ng = finest.Ng;
@@ -1881,9 +1883,10 @@ int MultigridPoissonSolver::solve_device(double* rhs_present, double* p_present,
                 const int plane_stride = finest.plane_stride;
                 const bool is_2d = finest.is2D();
 
-                // NVHPC WORKAROUND: Use omp_get_mapped_ptr for actual device addresses
+                // NVHPC WORKAROUND: Use member pointer f_level0_ptr_ instead of vector element f_ptrs_[0]
+                // Vector element access can return stale addresses in NVHPC
                 int device = omp_get_default_device();
-                const double* f_ptr = static_cast<const double*>(omp_get_mapped_ptr(f_ptrs_[0], device));
+                const double* f_ptr = static_cast<const double*>(omp_get_mapped_ptr(f_level0_ptr_, device));
 
                 double b_sum_sq = 0.0;
 
@@ -1911,6 +1914,12 @@ int MultigridPoissonSolver::solve_device(double* rhs_present, double* p_present,
                     }
                 }
                 b_l2_ = std::sqrt(b_sum_sq);
+
+                // Sanity check: if ||b||_2 is invalid or garbage, fall back to all cycles
+                // This prevents early exit on bad reduction results
+                if (!std::isfinite(b_l2_) || b_l2_ < 1e-30) {
+                    b_l2_ = 0.0;  // Force rel_res check to use raw residual
+                }
             }
 
             // First batch of cycles