Skip to content

Commit 94c7963

Browse files
sbryngelson and claude
committed
Fix GPU reduction in adaptive mode: use is_device_ptr
The adaptive-mode ||b||_2 computation was using a map(present:) clause, which does not work correctly with NVHPC. Replace it with the established omp_get_mapped_ptr() + is_device_ptr() pattern that works throughout the rest of the codebase. The map(present:) clause was causing the GPU CI test "Rest frame projection converges" to fail with extremely high ratios (775x-969x instead of <5x) because the GPU reduction was reading incorrect data. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent bdf62f1 commit 94c7963

File tree

1 file changed

+8
-8
lines changed

1 file changed

+8
-8
lines changed

src/poisson_solver_multigrid.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1889,32 +1889,32 @@ int MultigridPoissonSolver::solve_device(double* rhs_present, double* p_present,
18891889
const int Nz = finest.Nz;
18901890
const int stride = finest.stride;
18911891
const int plane_stride = finest.plane_stride;
1892-
[[maybe_unused]] const size_t f_size = finest.total_size;
1893-
const double* f_ptr = f_ptrs_[0];
18941892
const bool is_2d = finest.is2D();
18951893

18961894
double b_sum_sq = 0.0;
18971895

1896+
// NVHPC WORKAROUND: Use omp_get_mapped_ptr for actual device addresses
1897+
int device = omp_get_default_device();
1898+
const double* f_dev = static_cast<const double*>(omp_get_mapped_ptr(f_ptrs_[0], device));
1899+
18981900
if (is_2d) {
18991901
#pragma omp target teams distribute parallel for collapse(2) \
1900-
map(present: f_ptr[0:f_size]) \
1901-
reduction(+: b_sum_sq)
1902+
is_device_ptr(f_dev) reduction(+: b_sum_sq)
19021903
for (int j = Ng; j < Ny + Ng; ++j) {
19031904
for (int i = Ng; i < Nx + Ng; ++i) {
19041905
int idx = j * stride + i;
1905-
double val = f_ptr[idx];
1906+
double val = f_dev[idx];
19061907
b_sum_sq += val * val;
19071908
}
19081909
}
19091910
} else {
19101911
#pragma omp target teams distribute parallel for collapse(3) \
1911-
map(present: f_ptr[0:f_size]) \
1912-
reduction(+: b_sum_sq)
1912+
is_device_ptr(f_dev) reduction(+: b_sum_sq)
19131913
for (int k = Ng; k < Nz + Ng; ++k) {
19141914
for (int j = Ng; j < Ny + Ng; ++j) {
19151915
for (int i = Ng; i < Nx + Ng; ++i) {
19161916
int idx = k * plane_stride + j * stride + i;
1917-
double val = f_ptr[idx];
1917+
double val = f_dev[idx];
19181918
b_sum_sq += val * val;
19191919
}
19201920
}

0 commit comments

Comments
 (0)