Fix GPU reduction in adaptive mode: use is_device_ptr

sbryngelson · claude · sbryngelson · commit 94c796331a10 · 2026-01-27T16:37:24.000-05:00
The adaptive-mode ||b||_2 computation was using a map(present:) clause,
which does not work correctly with NVHPC. Replace it with the established
pattern of omp_get_mapped_ptr() + is_device_ptr() that works
throughout the rest of the codebase.

This was causing the GPU CI test "Rest frame projection converges"
to fail with extremely high ratios (775x-969x instead of <5x) because
the GPU reduction was reading incorrect data.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/src/poisson_solver_multigrid.cpp b/src/poisson_solver_multigrid.cpp
@@ -1889,32 +1889,32 @@ int MultigridPoissonSolver::solve_device(double* rhs_present, double* p_present,
                 const int Nz = finest.Nz;
                 const int stride = finest.stride;
                 const int plane_stride = finest.plane_stride;
-                [[maybe_unused]] const size_t f_size = finest.total_size;
-                const double* f_ptr = f_ptrs_[0];
                 const bool is_2d = finest.is2D();
 
                 double b_sum_sq = 0.0;
 
+                // NVHPC WORKAROUND: Use omp_get_mapped_ptr for actual device addresses
+                int device = omp_get_default_device();
+                const double* f_dev = static_cast<const double*>(omp_get_mapped_ptr(f_ptrs_[0], device));
+
                 if (is_2d) {
                     #pragma omp target teams distribute parallel for collapse(2) \
-                        map(present: f_ptr[0:f_size]) \
-                        reduction(+: b_sum_sq)
+                        is_device_ptr(f_dev) reduction(+: b_sum_sq)
                     for (int j = Ng; j < Ny + Ng; ++j) {
                         for (int i = Ng; i < Nx + Ng; ++i) {
                             int idx = j * stride + i;
-                            double val = f_ptr[idx];
+                            double val = f_dev[idx];
                             b_sum_sq += val * val;
                         }
                     }
                 } else {
                     #pragma omp target teams distribute parallel for collapse(3) \
-                        map(present: f_ptr[0:f_size]) \
-                        reduction(+: b_sum_sq)
+                        is_device_ptr(f_dev) reduction(+: b_sum_sq)
                     for (int k = Ng; k < Nz + Ng; ++k) {
                         for (int j = Ng; j < Ny + Ng; ++j) {
                             for (int i = Ng; i < Nx + Ng; ++i) {
                                 int idx = k * plane_stride + j * stride + i;
-                                double val = f_ptr[idx];
+                                double val = f_dev[idx];
                                 b_sum_sq += val * val;
                             }
                         }