Skip to content

Commit bdf62f1

Browse files
sbryngelsonclaude
andcommitted
Fix O4 stencil support and CodeRabbit review issues
- Fix unreachable O4 skew path: reorder if-else chain to check use_O4 && use_skew before plain use_skew (was dead code) - Fix multi-layer ghost cell Dirichlet BCs: use Ng+g indexing for proper linear extrapolation through boundary - Fix config_ vs config parameter bug in space_order safety check - Fix RHS norm computation: use Ng-based loop bounds instead of hardcoded 1..Nx (supports O4 with Ng=2) - Fix V-cycle graph config: use grid.Ng instead of hardcoded 1 - Add perf_bench for branch performance comparisons Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent de074c0 commit bdf62f1

File tree

4 files changed

+179
-68
lines changed

4 files changed

+179
-68
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,10 @@ if(BUILD_TESTS)
548548
target_link_libraries(bench_mg_cuda_graphs PRIVATE nn_cfd_core)
549549
endif()
550550

551+
# Performance benchmark for branch comparisons
552+
add_executable(perf_bench perf_bench.cpp)
553+
target_link_libraries(perf_bench PRIVATE nn_cfd_core)
554+
551555
# =========================================================================
552556
# Convenience targets for running tests
553557
# =========================================================================

perf_bench.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/// @file perf_bench.cpp
2+
/// @brief Simple performance benchmark for branch comparison
3+
/// Usage: ./perf_bench [grid_size] [num_steps]
4+
5+
#include "mesh.hpp"
6+
#include "fields.hpp"
7+
#include "solver.hpp"
8+
#include <iostream>
9+
#include <iomanip>
10+
#include <chrono>
11+
#include <cmath>
12+
13+
using namespace nncfd;
14+
using namespace std::chrono;
15+
16+
int main(int argc, char** argv) {
17+
// Parse arguments
18+
int N = 64; // Default grid size
19+
int nsteps = 100; // Default number of steps
20+
21+
if (argc > 1) N = std::atoi(argv[1]);
22+
if (argc > 2) nsteps = std::atoi(argv[2]);
23+
24+
std::cout << "=== Performance Benchmark ===" << std::endl;
25+
std::cout << "Grid: " << N << "³ | Steps: " << nsteps << std::endl;
26+
27+
// Create 3D mesh (periodic in x,z, walls in y - channel flow)
28+
const double Lx = 2.0 * M_PI;
29+
const double Ly = 2.0;
30+
const double Lz = M_PI;
31+
32+
Mesh mesh;
33+
mesh.init_uniform(N, N, N, 0.0, Lx, 0.0, Ly, 0.0, Lz);
34+
35+
// Configure solver - minimal settings, no I/O
36+
Config config;
37+
config.nu = 1e-4;
38+
config.dt = 1e-4;
39+
config.adaptive_dt = false;
40+
config.max_steps = nsteps;
41+
config.verbose = false;
42+
config.output_freq = 0;
43+
config.write_fields = false;
44+
config.num_snapshots = 0;
45+
config.turb_model = TurbulenceModelType::None;
46+
config.poisson_solver = PoissonSolverType::MG;
47+
48+
// Create solver
49+
RANSSolver solver(mesh, config);
50+
51+
// Set BCs: periodic x,z, no-slip y (channel)
52+
VelocityBC bc;
53+
bc.x_lo = VelocityBC::Periodic;
54+
bc.x_hi = VelocityBC::Periodic;
55+
bc.y_lo = VelocityBC::NoSlip;
56+
bc.y_hi = VelocityBC::NoSlip;
57+
bc.z_lo = VelocityBC::Periodic;
58+
bc.z_hi = VelocityBC::Periodic;
59+
solver.set_velocity_bc(bc);
60+
61+
// Initialize with Taylor-Green-like vortex
62+
const double U0 = 1.0;
63+
for (int k = 1; k <= N; ++k) {
64+
double z = mesh.z(k);
65+
for (int j = 1; j <= N; ++j) {
66+
double y = mesh.y(j);
67+
for (int i = 1; i <= N + 1; ++i) {
68+
double x = mesh.xf[i];
69+
solver.velocity().u(i, j, k) = U0 * std::sin(x) * std::cos(y) * std::cos(z);
70+
}
71+
}
72+
}
73+
for (int k = 1; k <= N; ++k) {
74+
double z = mesh.z(k);
75+
for (int j = 1; j <= N + 1; ++j) {
76+
double y = mesh.yf[j];
77+
for (int i = 1; i <= N; ++i) {
78+
double x = mesh.x(i);
79+
solver.velocity().v(i, j, k) = -U0 * std::cos(x) * std::sin(y) * std::cos(z);
80+
}
81+
}
82+
}
83+
// w = 0
84+
85+
// Warmup (1 step)
86+
solver.step();
87+
88+
// Timed run
89+
auto start = high_resolution_clock::now();
90+
for (int i = 1; i < nsteps; ++i) {
91+
solver.step();
92+
}
93+
auto end = high_resolution_clock::now();
94+
95+
double elapsed_ms = duration_cast<microseconds>(end - start).count() / 1000.0;
96+
double ms_per_step = elapsed_ms / (nsteps - 1);
97+
double mcells_per_sec = (static_cast<double>(N) * N * N) / (ms_per_step * 1000.0);
98+
99+
std::cout << std::fixed << std::setprecision(2);
100+
std::cout << "Total time: " << elapsed_ms << " ms" << std::endl;
101+
std::cout << "Per step: " << std::setprecision(3) << ms_per_step << " ms" << std::endl;
102+
std::cout << "Throughput: " << std::setprecision(2) << mcells_per_sec << " Mcells/s" << std::endl;
103+
104+
return 0;
105+
}

src/poisson_solver_multigrid.cpp

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -230,17 +230,17 @@ void MultigridPoissonSolver::apply_bc(int level) {
230230
u_ptr[idx_lo] = u_ptr[j * stride + Nx + g];
231231
} else if (bc_x_lo == 1) { // Neumann (zero gradient)
232232
u_ptr[idx_lo] = u_ptr[j * stride + Ng];
233-
} else { // Dirichlet
234-
u_ptr[idx_lo] = 2.0 * dval - u_ptr[j * stride + Ng];
233+
} else { // Dirichlet - mirror ghost g through boundary using interior cell Ng+g
234+
u_ptr[idx_lo] = 2.0 * dval - u_ptr[j * stride + (Ng + g)];
235235
}
236236

237237
// Right boundary - periodic wraps to left interior
238238
if (bc_x_hi == 2) { // Periodic
239239
u_ptr[idx_hi] = u_ptr[j * stride + Ng + g];
240240
} else if (bc_x_hi == 1) { // Neumann (zero gradient)
241241
u_ptr[idx_hi] = u_ptr[j * stride + (Nx + Ng - 1)];
242-
} else { // Dirichlet
243-
u_ptr[idx_hi] = 2.0 * dval - u_ptr[j * stride + (Nx + Ng - 1)];
242+
} else { // Dirichlet - mirror ghost g through boundary using interior cell (Nx+Ng-1-g)
243+
u_ptr[idx_hi] = 2.0 * dval - u_ptr[j * stride + (Nx + Ng - 1 - g)];
244244
}
245245
}
246246
}
@@ -257,17 +257,17 @@ void MultigridPoissonSolver::apply_bc(int level) {
257257
u_ptr[idx_lo] = u_ptr[(Ny + g) * stride + i];
258258
} else if (bc_y_lo == 1) { // Neumann (zero gradient)
259259
u_ptr[idx_lo] = u_ptr[Ng * stride + i];
260-
} else { // Dirichlet
261-
u_ptr[idx_lo] = 2.0 * dval - u_ptr[Ng * stride + i];
260+
} else { // Dirichlet - mirror ghost g through boundary using interior cell Ng+g
261+
u_ptr[idx_lo] = 2.0 * dval - u_ptr[(Ng + g) * stride + i];
262262
}
263263

264264
// Top boundary - periodic wraps to bottom interior
265265
if (bc_y_hi == 2) { // Periodic
266266
u_ptr[idx_hi] = u_ptr[(Ng + g) * stride + i];
267267
} else if (bc_y_hi == 1) { // Neumann (zero gradient)
268268
u_ptr[idx_hi] = u_ptr[(Ny + Ng - 1) * stride + i];
269-
} else { // Dirichlet
270-
u_ptr[idx_hi] = 2.0 * dval - u_ptr[(Ny + Ng - 1) * stride + i];
269+
} else { // Dirichlet - mirror ghost g through boundary using interior cell (Ny+Ng-1-g)
270+
u_ptr[idx_hi] = 2.0 * dval - u_ptr[(Ny + Ng - 1 - g) * stride + i];
271271
}
272272
}
273273
}
@@ -345,17 +345,17 @@ void MultigridPoissonSolver::apply_bc(int level) {
345345
u_ptr[idx_lo] = u_ptr[k * plane_stride + j * stride + Nx + g];
346346
} else if (bc_x_lo == 1) {
347347
u_ptr[idx_lo] = u_ptr[k * plane_stride + j * stride + Ng];
348-
} else {
349-
u_ptr[idx_lo] = 2.0 * dval - u_ptr[k * plane_stride + j * stride + Ng];
348+
} else { // Dirichlet - mirror ghost g through boundary using interior cell Ng+g
349+
u_ptr[idx_lo] = 2.0 * dval - u_ptr[k * plane_stride + j * stride + (Ng + g)];
350350
}
351351

352352
// Right boundary - periodic wraps to left interior
353353
if (bc_x_hi == 2) {
354354
u_ptr[idx_hi] = u_ptr[k * plane_stride + j * stride + Ng + g];
355355
} else if (bc_x_hi == 1) {
356356
u_ptr[idx_hi] = u_ptr[k * plane_stride + j * stride + (Nx + Ng - 1)];
357-
} else {
358-
u_ptr[idx_hi] = 2.0 * dval - u_ptr[k * plane_stride + j * stride + (Nx + Ng - 1)];
357+
} else { // Dirichlet - mirror ghost g through boundary using interior cell (Nx+Ng-1-g)
358+
u_ptr[idx_hi] = 2.0 * dval - u_ptr[k * plane_stride + j * stride + (Nx + Ng - 1 - g)];
359359
}
360360
}
361361
}
@@ -374,17 +374,17 @@ void MultigridPoissonSolver::apply_bc(int level) {
374374
u_ptr[idx_lo] = u_ptr[k * plane_stride + (Ny + g) * stride + i];
375375
} else if (bc_y_lo == 1) {
376376
u_ptr[idx_lo] = u_ptr[k * plane_stride + Ng * stride + i];
377-
} else {
378-
u_ptr[idx_lo] = 2.0 * dval - u_ptr[k * plane_stride + Ng * stride + i];
377+
} else { // Dirichlet - mirror ghost g through boundary using interior cell Ng+g
378+
u_ptr[idx_lo] = 2.0 * dval - u_ptr[k * plane_stride + (Ng + g) * stride + i];
379379
}
380380

381381
// Top boundary - periodic wraps to bottom interior
382382
if (bc_y_hi == 2) {
383383
u_ptr[idx_hi] = u_ptr[k * plane_stride + (Ng + g) * stride + i];
384384
} else if (bc_y_hi == 1) {
385385
u_ptr[idx_hi] = u_ptr[k * plane_stride + (Ny + Ng - 1) * stride + i];
386-
} else {
387-
u_ptr[idx_hi] = 2.0 * dval - u_ptr[k * plane_stride + (Ny + Ng - 1) * stride + i];
386+
} else { // Dirichlet - mirror ghost g through boundary using interior cell (Ny+Ng-1-g)
387+
u_ptr[idx_hi] = 2.0 * dval - u_ptr[k * plane_stride + (Ny + Ng - 1 - g) * stride + i];
388388
}
389389
}
390390
}
@@ -403,17 +403,17 @@ void MultigridPoissonSolver::apply_bc(int level) {
403403
u_ptr[idx_lo] = u_ptr[(Nz + g) * plane_stride + j * stride + i];
404404
} else if (bc_z_lo == 1) {
405405
u_ptr[idx_lo] = u_ptr[Ng * plane_stride + j * stride + i];
406-
} else {
407-
u_ptr[idx_lo] = 2.0 * dval - u_ptr[Ng * plane_stride + j * stride + i];
406+
} else { // Dirichlet - mirror ghost g through boundary using interior cell Ng+g
407+
u_ptr[idx_lo] = 2.0 * dval - u_ptr[(Ng + g) * plane_stride + j * stride + i];
408408
}
409409

410410
// Front boundary - periodic wraps to back interior
411411
if (bc_z_hi == 2) {
412412
u_ptr[idx_hi] = u_ptr[(Ng + g) * plane_stride + j * stride + i];
413413
} else if (bc_z_hi == 1) {
414414
u_ptr[idx_hi] = u_ptr[(Nz + Ng - 1) * plane_stride + j * stride + i];
415-
} else {
416-
u_ptr[idx_hi] = 2.0 * dval - u_ptr[(Nz + Ng - 1) * plane_stride + j * stride + i];
415+
} else { // Dirichlet - mirror ghost g through boundary using interior cell (Nz+Ng-1-g)
416+
u_ptr[idx_hi] = 2.0 * dval - u_ptr[(Nz + Ng - 1 - g) * plane_stride + j * stride + i];
417417
}
418418
}
419419
}
@@ -1669,20 +1669,21 @@ int MultigridPoissonSolver::solve(const ScalarField& rhs, ScalarField& p, const
16691669
// Compute reference norms for relative tolerances (CPU path)
16701670
// ||b||_∞ = max|f| and ||b||_2 = sqrt(sum(f^2)) on finest level - store in member for diagnostics
16711671
auto& finest_cpu = *levels_[0];
1672+
const int Ng_f = finest_cpu.Ng; // Use actual Ng for correct interior indexing (O4 has Ng=2)
16721673
b_inf_ = 0.0;
16731674
double b_sum_sq = 0.0;
16741675
if (mesh_->is2D()) {
1675-
for (int j = 1; j <= finest_cpu.Ny; ++j) {
1676-
for (int i = 1; i <= finest_cpu.Nx; ++i) {
1676+
for (int j = Ng_f; j < Ng_f + finest_cpu.Ny; ++j) {
1677+
for (int i = Ng_f; i < Ng_f + finest_cpu.Nx; ++i) {
16771678
double val = finest_cpu.f(i, j);
16781679
b_inf_ = std::max(b_inf_, std::abs(val));
16791680
b_sum_sq += val * val;
16801681
}
16811682
}
16821683
} else {
1683-
for (int k = 1; k <= finest_cpu.Nz; ++k) {
1684-
for (int j = 1; j <= finest_cpu.Ny; ++j) {
1685-
for (int i = 1; i <= finest_cpu.Nx; ++i) {
1684+
for (int k = Ng_f; k < Ng_f + finest_cpu.Nz; ++k) {
1685+
for (int j = Ng_f; j < Ng_f + finest_cpu.Ny; ++j) {
1686+
for (int i = Ng_f; i < Ng_f + finest_cpu.Nx; ++i) {
16861687
double val = finest_cpu.f(i, j, k);
16871688
b_inf_ = std::max(b_inf_, std::abs(val));
16881689
b_sum_sq += val * val;
@@ -2279,7 +2280,7 @@ void MultigridPoissonSolver::initialize_vcycle_graph(int nu1, int nu2, int degre
22792280
cfg.Nx = grid.Nx;
22802281
cfg.Ny = grid.Ny;
22812282
cfg.Nz = grid.Nz;
2282-
cfg.Ng = 1;
2283+
cfg.Ng = grid.Ng; // Use actual Ng (finest level may have Ng=2 for O4 stencils)
22832284
cfg.dx2 = grid.dx * grid.dx;
22842285
cfg.dy2 = grid.dy * grid.dy;
22852286
cfg.dz2 = grid.dz * grid.dz;

0 commit comments

Comments
 (0)