LCOV - coverage.info - solvers/navier_stokes/avx2/solver_projection

LCOV - code coverage report

Current view:	top level - solvers/navier_stokes/avx2 - solver_projection_avx2.c (source / functions)		Coverage	Total	Hit
Test:	coverage.info	Lines:	10.2 %	128	13
Test Date:	2026-06-23 13:41:07	Functions:	100.0 %	3	3

            Line data    Source code

       1              : /**
       2              :  * Optimized Projection Method NSSolver (Chorin's Method) with SIMD + OpenMP
       3              :  *
       4              :  * This implementation combines SIMD vectorization (AVX2) with OpenMP
       5              :  * parallelization for maximum performance on multi-core CPUs.
       6              :  *
       7              :  * - Predictor step: OpenMP parallelized (scalar inner loops)
       8              :  * - Corrector step: OpenMP parallelized with AVX2 SIMD inner loops
       9              :  * - Poisson solver: Uses SIMD Poisson solver for pressure computation
      10              :  */
      11              : 
      12              : #include "cfd/boundary/boundary_conditions.h"
      13              : #include "cfd/core/cfd_status.h"
      14              : #include "cfd/core/grid.h"
      15              : #include "cfd/core/indexing.h"
      16              : #include "cfd/core/logging.h"
      17              : #include "cfd/core/memory.h"
      18              : #include "cfd/solvers/navier_stokes_solver.h"
      19              : #include "cfd/solvers/poisson_solver.h"
      20              : #include "cfd/solvers/energy_solver.h"
      21              : #include "../../energy/energy_solver_internal.h"
      22              : 
      23              : #include "../boundary_copy_utils.h"
      24              : 
      25              : #include <math.h>
      26              : #include <stdio.h>
      27              : #include <string.h>
      28              : 
      29              : #ifndef M_PI  /* Fallback: ISO C <math.h> is not required to define M_PI. NOTE(review): M_PI is not referenced elsewhere in this file as listed — confirm it is still needed */
      30              : #define M_PI 3.14159265358979323846
      31              : #endif
      32              : 
      33              : #ifdef _OPENMP
      34              : #include <omp.h>
      35              : #endif
      36              : 
      37              : /* AVX2 detection
      38              :  * CFD_HAS_AVX2 is set by CMake when -DCFD_ENABLE_AVX2=ON.
      39              :  * This works consistently across all compilers (GCC, Clang, MSVC).
      40              :  */
      41              : #if defined(CFD_HAS_AVX2)
      42              : #include <immintrin.h>
      43              : #define USE_AVX 1  /* compiles the 4-wide __m256d corrector loop in projection_simd_step */
      44              : #else
      45              : #define USE_AVX 0  /* corrector uses only the scalar loop */
      46              : #endif
      47              : 
      48              : // Physical limits: each velocity component is clamped to [-MAX_VELOCITY, MAX_VELOCITY] after the predictor and after the corrector
      49              : #define MAX_VELOCITY 100.0
      50              : 
      51              : typedef struct {  /* Per-solver state: allocated by projection_simd_init, used by projection_simd_step, released by projection_simd_destroy */
      52              :     double* u_star;  /* intermediate (predictor) u velocity */
      53              :     double* v_star;  /* intermediate (predictor) v velocity */
      54              :     double* w_star;  /* intermediate (predictor) w velocity */
      55              :     double* p_new;  /* pressure work buffer: seeded from field->p, passed to poisson_solve_3d, then copied back to field->p */
      56              :     double* rhs;  /* Poisson right-hand side: (rho/dt) * div(u*) on interior cells, zero elsewhere */
      57              :     double* u_new;  /* used as p_temp for Poisson solver */
      58              :     double* T_ws;   /* Reusable scratch for the energy step (avoids per-step alloc) */
      59              :     size_t nx;  /* grid dimensions captured at init; step rejects a field whose nx/ny/nz differ */
      60              :     size_t ny;
      61              :     size_t nz;
      62              :     size_t stride_z;  /* nx*ny when nz > 1, otherwise 0 (z-neighbour offsets then collapse onto the cell itself) */
      63              :     size_t k_start;  /* first k plane visited by the kernels: 1 when nz > 1, otherwise 0 */
      64              :     size_t k_end;  /* one past the last k plane visited: nz-1 when nz > 1, otherwise 1 */
      65              :     double inv_2dz;  /* 1/(2*dz), or 0.0 when nz <= 1 or grid->dz is NULL */
      66              :     double inv_dz2;  /* 1/(dz*dz), or 0.0 when nz <= 1 or grid->dz is NULL */
      67              :     int initialized;  /* set to 1 once every buffer is allocated; destroy frees the buffers only when set */
      68              :     int iter_count;  /* incremented at the end of each step that returns CFD_SUCCESS; also used as time index (iter_count * dt) */
      69              : } projection_simd_context;
      70              : 
      71              : // Public API: forward declarations of the init / destroy / step entry points defined below in this file
      72              : cfd_status_t projection_simd_init(struct NSSolver* solver, const grid* grid,
      73              :                                   const ns_solver_params_t* params);
      74              : void projection_simd_destroy(struct NSSolver* solver);
      75              : cfd_status_t projection_simd_step(struct NSSolver* solver, flow_field* field, const grid* grid,
      76              :                                   const ns_solver_params_t* params, ns_solver_stats_t* stats);
      77              : /* Create the projection context and store it in solver->context. Returns CFD_ERROR_INVALID for NULL solver/grid, nx or ny < 3, nz == 2, or non-uniform dz; CFD_ERROR_UNSUPPORTED when no CG/SIMD Poisson solver can be created; CFD_ERROR_NOMEM on allocation failure; CFD_SUCCESS otherwise. */
      78           15 : cfd_status_t projection_simd_init(struct NSSolver* solver, const grid* grid,
      79              :                                   const ns_solver_params_t* params) {
      80           15 :     (void)params;  /* params is not read by this function */
      81           15 :     if (!solver || !grid) {
      82              :         return CFD_ERROR_INVALID;
      83              :     }
      84           15 :     if (grid->nx < 3 || grid->ny < 3 || (grid->nz > 1 && grid->nz < 3)) {  /* NOTE(review): nz == 0 also passes this check (buffers would be 0 bytes) — confirm callers guarantee nz >= 1 */
      85              :         return CFD_ERROR_INVALID;
      86              :     }
      87              : 
      88              :     /* Verify SIMD CG Poisson solver is available before allocating resources */
      89           15 :     poisson_solver_t* test_solver = poisson_solver_create(
      90              :         POISSON_METHOD_CG, POISSON_BACKEND_SIMD);
      91           15 :     if (!test_solver) {
      92           15 :         CFD_LOG_WARNING("projection", "SIMD CG Poisson solver not available");
      93           15 :         return CFD_ERROR_UNSUPPORTED;
      94              :     }
      95            0 :     poisson_solver_destroy(test_solver);  /* the probe instance is only an availability check; it is not kept */
      96              : 
      97            0 :     projection_simd_context* ctx =
      98            0 :         (projection_simd_context*)cfd_calloc(1, sizeof(projection_simd_context));
      99            0 :     if (!ctx) {
     100              :         return CFD_ERROR_NOMEM;
     101              :     }
     102              : 
     103            0 :     ctx->nx = grid->nx;
     104            0 :     ctx->ny = grid->ny;
     105            0 :     ctx->nz = grid->nz;
     106            0 :     size_t size = ctx->nx * ctx->ny * grid->nz * sizeof(double);  /* bytes per work buffer; NOTE(review): the product is not checked for size_t overflow */
     107              : 
     108              :     /* Reject non-uniform z-spacing (solver uses constant dz) */
     109            0 :     if (grid->nz > 1 && grid->dz) {
     110            0 :         for (size_t kk = 1; kk < grid->nz - 1; kk++) {  /* compares dz[1..nz-2] against dz[0]; NOTE(review): confirm dz has no entry at nz-1 that also needs checking */
     111            0 :             if (fabs(grid->dz[kk] - grid->dz[0]) > 1e-14) {
     112            0 :                 cfd_free(ctx);  /* no buffers have been allocated yet, so only the context is released */
     113            0 :                 return CFD_ERROR_INVALID;
     114              :             }
     115              :         }
     116              :     }
     117              : 
     118            0 :     size_t plane = ctx->nx * ctx->ny;
     119            0 :     ctx->stride_z = (grid->nz > 1) ? plane : 0;  /* stride 0 makes idx +/- stride_z refer to the cell itself when nz <= 1 */
     120            0 :     ctx->k_start  = (grid->nz > 1) ? 1 : 0;
     121            0 :     ctx->k_end    = (grid->nz > 1) ? (grid->nz - 1) : 1;  /* exactly one k pass (k = 0) when nz <= 1 */
     122            0 :     double dz = (grid->nz > 1 && grid->dz) ? grid->dz[0] : 0.0;  /* NOTE(review): dz[0] == 0 would make the reciprocals below infinite — confirm the grid guarantees dz > 0 */
     123            0 :     ctx->inv_2dz  = (grid->nz > 1 && grid->dz) ? 1.0 / (2.0 * dz) : 0.0;
     124            0 :     ctx->inv_dz2  = (grid->nz > 1 && grid->dz) ? 1.0 / (dz * dz) : 0.0;
     125              : 
     126            0 :     ctx->u_star = (double*)cfd_aligned_malloc(size);  /* seven work buffers, all of the same byte size */
     127            0 :     ctx->v_star = (double*)cfd_aligned_malloc(size);
     128            0 :     ctx->w_star = (double*)cfd_aligned_malloc(size);
     129            0 :     ctx->p_new  = (double*)cfd_aligned_malloc(size);
     130            0 :     ctx->rhs    = (double*)cfd_aligned_malloc(size);
     131            0 :     ctx->u_new  = (double*)cfd_aligned_malloc(size);
     132            0 :     ctx->T_ws   = (double*)cfd_aligned_malloc(size);
     133              : 
     134            0 :     if (!ctx->u_star || !ctx->v_star || !ctx->w_star || !ctx->p_new ||
     135            0 :         !ctx->rhs || !ctx->u_new || !ctx->T_ws) {  /* any failed allocation: release the ones that succeeded, then the context */
     136            0 :         if (ctx->u_star) {
     137            0 :             cfd_aligned_free(ctx->u_star);
     138              :         }
     139            0 :         if (ctx->v_star) {
     140            0 :             cfd_aligned_free(ctx->v_star);
     141              :         }
     142            0 :         if (ctx->w_star) {
     143            0 :             cfd_aligned_free(ctx->w_star);
     144              :         }
     145            0 :         if (ctx->p_new) {
     146            0 :             cfd_aligned_free(ctx->p_new);
     147              :         }
     148            0 :         if (ctx->rhs) {
     149            0 :             cfd_aligned_free(ctx->rhs);
     150              :         }
     151            0 :         if (ctx->u_new) {
     152            0 :             cfd_aligned_free(ctx->u_new);
     153              :         }
     154            0 :         if (ctx->T_ws) {
     155            0 :             cfd_aligned_free(ctx->T_ws);
     156              :         }
     157            0 :         cfd_free(ctx);
     158            0 :         return CFD_ERROR_NOMEM;
     159              :     }
     160              : 
     161            0 :     ctx->initialized = 1;
     162            0 :     solver->context = ctx;  /* NOTE(review): a context already stored on the solver would be overwritten here without being freed — confirm init is never called twice */
     163            0 :     return CFD_SUCCESS;
     164              : }
     165              : /* Release the context created by projection_simd_init and reset solver->context to NULL. Does nothing when solver or solver->context is NULL. */
     166           19 : void projection_simd_destroy(struct NSSolver* solver) {
     167           19 :     if (solver && solver->context) {
     168            0 :         projection_simd_context* ctx = (projection_simd_context*)solver->context;
     169            0 :         if (ctx->initialized) {  /* buffers are only freed for a context that finished initialization */
     170            0 :             cfd_aligned_free(ctx->u_star);
     171            0 :             cfd_aligned_free(ctx->v_star);
     172            0 :             cfd_aligned_free(ctx->w_star);
     173            0 :             cfd_aligned_free(ctx->p_new);
     174            0 :             cfd_aligned_free(ctx->rhs);
     175            0 :             cfd_aligned_free(ctx->u_new);
     176            0 :             cfd_aligned_free(ctx->T_ws);
     177              :         }
     178            0 :         cfd_free(ctx);
     179            0 :         solver->context = NULL;  /* a second destroy call on the same solver then falls through the guard above */
     180              :     }
     181           19 : }
     182              : /* Advance the field by one time step: predictor (u*), pressure Poisson solve, corrector, energy step, thermal BCs, boundary restore, finiteness check. Returns CFD_ERROR_INVALID for NULL arguments, too-small dimensions or a field/context size mismatch; CFD_ERROR_MAX_ITER when poisson_solve_3d returns a negative value; any non-success status from the energy step or thermal BCs; CFD_ERROR_DIVERGED when u, v, w or p contains a non-finite value; CFD_SUCCESS otherwise. */
     183            1 : cfd_status_t projection_simd_step(struct NSSolver* solver, flow_field* field, const grid* grid,
     184              :                                   const ns_solver_params_t* params, ns_solver_stats_t* stats) {  /* stats may be NULL; it is written only on success */
     185            1 :     if (!solver || !solver->context || !field || !grid || !params) {
     186              :         return CFD_ERROR_INVALID;
     187              :     }
     188            0 :     if (field->nx < 3 || field->ny < 3 || (field->nz > 1 && field->nz < 3)) {
     189              :         return CFD_ERROR_INVALID;
     190              :     }
     191              : 
     192            0 :     projection_simd_context* ctx = (projection_simd_context*)solver->context;
     193              : 
     194              :     // Verify context matches current grid (the work buffers were sized from ctx->nx/ny/nz at init)
     195            0 :     if (ctx->nx != field->nx || ctx->ny != field->ny || ctx->nz != field->nz) {
     196              :         return CFD_ERROR_INVALID;
     197              :     }
     198              : 
     199            0 :     size_t nx = field->nx;
     200            0 :     size_t ny = field->ny;
     201            0 :     size_t size = nx * ny * ctx->nz;  // element count (not bytes)
     202              : 
     203            0 :     double dx = grid->dx[0];  // NOTE(review): only dx[0] and dy[0] are read — uniform x/y spacing is assumed and not verified in this file
     204            0 :     double dy = grid->dy[0];
     205            0 :     double dz = (ctx->nz > 1 && grid->dz) ? grid->dz[0] : 0.0;
     206            0 :     double dt = params->dt;
     207            0 :     double nu = params->mu;  // Viscosity (treated as kinematic for ρ=1)
     208              : 
     209            0 :     double* u_star = ctx->u_star;
     210            0 :     double* v_star = ctx->v_star;
     211            0 :     double* w_star = ctx->w_star;
     212            0 :     double* p_new = ctx->p_new;
     213            0 :     double* rhs = ctx->rhs;
     214              : 
     215              :     // Copy current field values to work buffers (includes boundaries)
     216            0 :     memcpy(u_star, field->u, size * sizeof(double));
     217            0 :     memcpy(v_star, field->v, size * sizeof(double));
     218            0 :     memcpy(w_star, field->w, size * sizeof(double));  // NOTE(review): assumes field->w is allocated even when nz == 1 — confirm
     219            0 :     memcpy(p_new, field->p, size * sizeof(double));
     220            0 :     memset(rhs, 0, size * sizeof(double));  // only interior entries are overwritten in STEP 2; the rest stay 0
     221              : 
     222              :     // ============================================================
     223              :     // STEP 1: Predictor - Compute intermediate velocity u*
     224              :     // (OpenMP parallelized outer loop, scalar inner loop)
     225              :     // ============================================================
     226            0 :     int ny_int = (int)ny;
     227            0 :     int nx_int = (int)nx;
     228            0 :     int jj;  // signed row index shared by the three j loops below; presumably kept as int for OpenMP implementations that need a signed loop variable — confirm
     229            0 :     (void)nx_int;  /* suppress unused variable warning */
     230              : 
     231            0 :     for (size_t k = ctx->k_start; k < ctx->k_end; k++) {  // interior planes only; a single pass with k = 0 when nz <= 1
     232            0 :         size_t k_off = k * ctx->stride_z;
     233              : #ifdef _OPENMP
     234            0 :         #pragma omp parallel for schedule(static)
     235              : #endif
     236              :         for (jj = 1; jj < ny_int - 1; jj++) {
     237              :             size_t j = (size_t)jj;
     238              :             for (size_t i = 1; i < nx - 1; i++) {
     239              :                 size_t idx = k_off + IDX_2D(i, j, nx);
     240              : 
     241              :                 double u = field->u[idx];
     242              :                 double v = field->v[idx];
     243              :                 double w = field->w[idx];
     244              : 
     245              :                 // Convective terms: -u·∇u (central differences)
     246              :                 double du_dx = (field->u[idx + 1] - field->u[idx - 1]) / (2.0 * dx);
     247              :                 double du_dy = (field->u[idx + nx] - field->u[idx - nx]) / (2.0 * dy);
     248              :                 double du_dz = (field->u[idx + ctx->stride_z] - field->u[idx - ctx->stride_z]) *
     249              :                                ctx->inv_2dz;  // when nz <= 1: stride_z == 0 and inv_2dz == 0.0, so every z term in this function is 0
     250              : 
     251              :                 double dv_dx = (field->v[idx + 1] - field->v[idx - 1]) / (2.0 * dx);
     252              :                 double dv_dy = (field->v[idx + nx] - field->v[idx - nx]) / (2.0 * dy);
     253              :                 double dv_dz = (field->v[idx + ctx->stride_z] - field->v[idx - ctx->stride_z]) *
     254              :                                ctx->inv_2dz;
     255              : 
     256              :                 double dw_dx = (field->w[idx + 1] - field->w[idx - 1]) / (2.0 * dx);
     257              :                 double dw_dy = (field->w[idx + nx] - field->w[idx - nx]) / (2.0 * dy);
     258              :                 double dw_dz = (field->w[idx + ctx->stride_z] - field->w[idx - ctx->stride_z]) *
     259              :                                ctx->inv_2dz;
     260              : 
     261              :                 double conv_u = (u * du_dx) + (v * du_dy) + (w * du_dz);
     262              :                 double conv_v = (u * dv_dx) + (v * dv_dy) + (w * dv_dz);
     263              :                 double conv_w = (u * dw_dx) + (v * dw_dy) + (w * dw_dz);
     264              : 
     265              :                 // Viscous terms: ν∇²u
     266              :                 double d2u_dx2 = (field->u[idx + 1] - 2.0 * u + field->u[idx - 1]) / (dx * dx);
     267              :                 double d2u_dy2 = (field->u[idx + nx] - 2.0 * u + field->u[idx - nx]) / (dy * dy);
     268              :                 double d2u_dz2 = (field->u[idx + ctx->stride_z] - 2.0 * u +
     269              :                                   field->u[idx - ctx->stride_z]) * ctx->inv_dz2;
     270              : 
     271              :                 double d2v_dx2 = (field->v[idx + 1] - 2.0 * v + field->v[idx - 1]) / (dx * dx);
     272              :                 double d2v_dy2 = (field->v[idx + nx] - 2.0 * v + field->v[idx - nx]) / (dy * dy);
     273              :                 double d2v_dz2 = (field->v[idx + ctx->stride_z] - 2.0 * v +
     274              :                                   field->v[idx - ctx->stride_z]) * ctx->inv_dz2;
     275              : 
     276              :                 double d2w_dx2 = (field->w[idx + 1] - 2.0 * w + field->w[idx - 1]) / (dx * dx);
     277              :                 double d2w_dy2 = (field->w[idx + nx] - 2.0 * w + field->w[idx - nx]) / (dy * dy);
     278              :                 double d2w_dz2 = (field->w[idx + ctx->stride_z] - 2.0 * w +
     279              :                                   field->w[idx - ctx->stride_z]) * ctx->inv_dz2;
     280              : 
     281              :                 double visc_u = nu * (d2u_dx2 + d2u_dy2 + d2u_dz2);
     282              :                 double visc_v = nu * (d2v_dx2 + d2v_dy2 + d2v_dz2);
     283              :                 double visc_w = nu * (d2w_dx2 + d2w_dy2 + d2w_dz2);
     284              : 
     285              :                 // Source terms (start at zero; the two helpers below receive pointers to them)
     286              :                 double source_u = 0.0;
     287              :                 double source_v = 0.0;
     288              :                 double source_w = 0.0;
     289              :                 double x_coord = grid->x[i];
     290              :                 double y_coord = grid->y[j];
     291              :                 double z_coord = (ctx->nz > 1 && grid->z) ? grid->z[k] : 0.0;
     292              :                 compute_source_terms(x_coord, y_coord, z_coord, ctx->iter_count, dt, params,
     293              :                                      &source_u, &source_v, &source_w);
     294              : 
     295              :                 // Boussinesq buoyancy source (no-op when beta == 0)
     296              :                 energy_compute_buoyancy(field->T[idx], params,  // NOTE(review): field->T is dereferenced unconditionally — confirm it is always allocated
     297              :                                         &source_u, &source_v, &source_w);
     298              : 
     299              :                 // Intermediate velocity (without pressure gradient)
     300              :                 u_star[idx] = u + (dt * (-conv_u + visc_u + source_u));
     301              :                 v_star[idx] = v + (dt * (-conv_v + visc_v + source_v));
     302              :                 w_star[idx] = w + (dt * (-conv_w + visc_w + source_w));
     303              : 
     304              :                 // Limit velocities to [-MAX_VELOCITY, MAX_VELOCITY]
     305              :                 u_star[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, u_star[idx]));
     306              :                 v_star[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, v_star[idx]));
     307              :                 w_star[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, w_star[idx]));
     308              :             }
     309              :         }
     310              :     }
     311              : 
     312              :     // Copy boundary values from field to star arrays
     313            0 :     copy_boundary_velocities_3d(u_star, v_star, w_star, field->u, field->v, field->w,
     314              :                                 nx, ny, ctx->nz);
     315              : 
     316              :     // ============================================================
     317              :     // STEP 2: Solve Poisson equation for pressure
     318              :     // ∇²p = (ρ/dt) * ∇·u*
     319              :     // ============================================================
     320              : 
     321            0 :     double rho = field->rho[0];  // a single density value, read from the first cell, is used for the whole domain
     322            0 :     if (rho < 1e-10) {
     323            0 :         rho = 1.0;  // fall back to unit density when rho[0] is negative or nearly zero
     324              :     }
     325              : 
     326              :     // Compute RHS: divergence of intermediate velocity (central differences, interior cells only)
     327            0 :     for (size_t k = ctx->k_start; k < ctx->k_end; k++) {
     328            0 :         size_t k_off = k * ctx->stride_z;
     329              : #ifdef _OPENMP
     330            0 :         #pragma omp parallel for schedule(static)
     331              : #endif
     332              :         for (jj = 1; jj < ny_int - 1; jj++) {
     333              :             size_t j = (size_t)jj;
     334              :             for (size_t i = 1; i < nx - 1; i++) {
     335              :                 size_t idx = k_off + IDX_2D(i, j, nx);
     336              : 
     337              :                 double du_star_dx = (u_star[idx + 1] - u_star[idx - 1]) / (2.0 * dx);
     338              :                 double dv_star_dy = (v_star[idx + nx] - v_star[idx - nx]) / (2.0 * dy);
     339              :                 double dw_star_dz = (w_star[idx + ctx->stride_z] -
     340              :                                      w_star[idx - ctx->stride_z]) * ctx->inv_2dz;
     341              : 
     342              :                 double divergence = du_star_dx + dv_star_dy + dw_star_dz;
     343              :                 rhs[idx] = (rho / dt) * divergence;
     344              :             }
     345              :         }
     346              :     }
     347              : 
     348              :     // Use SIMD Poisson solver (Conjugate Gradient with SIMD)
     349              :     // ctx->u_new is used as temp buffer for the Poisson solver
     350            0 :     int poisson_iters = poisson_solve_3d(p_new, ctx->u_new, rhs, nx, ny, ctx->nz,
     351              :                                          dx, dy, dz, POISSON_SOLVER_CG_SIMD);
     352              : 
     353            0 :     if (poisson_iters < 0) {
     354              :         return CFD_ERROR_MAX_ITER;  // up to this point only the context buffers were written; field->u/v/w/p are still the input values
     355              :     }
     356              : 
     357              :     // ============================================================
     358              :     // STEP 3: Corrector - Project velocity to be divergence-free
     359              :     // u^(n+1) = u* - (dt/ρ) * ∇p
     360              :     // (OpenMP parallelized with SIMD inner loop)
     361              :     // ============================================================
     362              : 
     363            0 :     double dt_over_rho = dt / rho;
     364            0 :     double inv_2dx = 1.0 / (2.0 * dx);
     365            0 :     double inv_2dy = 1.0 / (2.0 * dy);
     366              : 
     367              : #if USE_AVX
     368              :     __m256d dt_rho_vec      = _mm256_set1_pd(dt_over_rho);
     369              :     __m256d inv_2dx_vec     = _mm256_set1_pd(inv_2dx);
     370              :     __m256d inv_2dy_vec     = _mm256_set1_pd(inv_2dy);
     371              :     __m256d inv_2dz_vec     = _mm256_set1_pd(ctx->inv_2dz);
     372              :     __m256d max_vel_vec     = _mm256_set1_pd(MAX_VELOCITY);
     373              :     __m256d neg_max_vel_vec = _mm256_set1_pd(-MAX_VELOCITY);
     374              : #endif
     375              : 
     376            0 :     for (size_t k = ctx->k_start; k < ctx->k_end; k++) {
     377            0 :         size_t k_off = k * ctx->stride_z;
     378              : #ifdef _OPENMP
     379            0 :         #pragma omp parallel for schedule(static)
     380              : #endif
     381              :         for (jj = 1; jj < ny_int - 1; jj++) {
     382              :             size_t j = (size_t)jj;
     383              :             size_t i = 1;  // shared by the SIMD loop and the scalar remainder so the latter resumes where the former stopped
     384              : 
     385              : #if USE_AVX
     386              :             // SIMD loop - process 4 cells at once (runs while cells i..i+3 are all interior; unaligned loads/stores)
     387              :             for (; i + 4 <= nx - 1; i += 4) {
     388              :                 size_t idx = k_off + IDX_2D(i, j, nx);
     389              : 
     390              :                 // Load pressure neighbors for gradient computation
     391              :                 __m256d p_xp = _mm256_loadu_pd(&p_new[idx + 1]);
     392              :                 __m256d p_xm = _mm256_loadu_pd(&p_new[idx - 1]);
     393              :                 __m256d p_yp = _mm256_loadu_pd(&p_new[idx + nx]);
     394              :                 __m256d p_ym = _mm256_loadu_pd(&p_new[idx - nx]);
     395              :                 __m256d p_zp = _mm256_loadu_pd(&p_new[idx + ctx->stride_z]);
     396              :                 __m256d p_zm = _mm256_loadu_pd(&p_new[idx - ctx->stride_z]);
     397              : 
     398              :                 // Compute pressure gradients
     399              :                 __m256d dp_dx = _mm256_mul_pd(_mm256_sub_pd(p_xp, p_xm), inv_2dx_vec);
     400              :                 __m256d dp_dy = _mm256_mul_pd(_mm256_sub_pd(p_yp, p_ym), inv_2dy_vec);
     401              :                 __m256d dp_dz = _mm256_mul_pd(_mm256_sub_pd(p_zp, p_zm), inv_2dz_vec);
     402              : 
     403              :                 // Load intermediate velocities
     404              :                 __m256d u_s = _mm256_loadu_pd(&u_star[idx]);
     405              :                 __m256d v_s = _mm256_loadu_pd(&v_star[idx]);
     406              :                 __m256d w_s = _mm256_loadu_pd(&w_star[idx]);
     407              : 
     408              :                 // Corrector: u = u* - (dt/rho) * dp/dx
     409              :                 __m256d u_new = _mm256_sub_pd(u_s, _mm256_mul_pd(dt_rho_vec, dp_dx));
     410              :                 __m256d v_new = _mm256_sub_pd(v_s, _mm256_mul_pd(dt_rho_vec, dp_dy));
     411              :                 __m256d w_new = _mm256_sub_pd(w_s, _mm256_mul_pd(dt_rho_vec, dp_dz));
     412              : 
     413              :                 // Clamp velocities to [-MAX_VELOCITY, MAX_VELOCITY]
     414              :                 u_new = _mm256_max_pd(neg_max_vel_vec, _mm256_min_pd(max_vel_vec, u_new));
     415              :                 v_new = _mm256_max_pd(neg_max_vel_vec, _mm256_min_pd(max_vel_vec, v_new));
     416              :                 w_new = _mm256_max_pd(neg_max_vel_vec, _mm256_min_pd(max_vel_vec, w_new));
     417              : 
     418              :                 // Store results
     419              :                 _mm256_storeu_pd(&field->u[idx], u_new);
     420              :                 _mm256_storeu_pd(&field->v[idx], v_new);
     421              :                 _mm256_storeu_pd(&field->w[idx], w_new);
     422              :             }
     423              : #endif
     424              : 
     425              :             // Scalar remainder (handles the whole row when USE_AVX is 0)
     426              :             for (; i < nx - 1; i++) {
     427              :                 size_t idx = k_off + IDX_2D(i, j, nx);
     428              : 
     429              :                 double dp_dx = (p_new[idx + 1] - p_new[idx - 1]) * inv_2dx;
     430              :                 double dp_dy = (p_new[idx + nx] - p_new[idx - nx]) * inv_2dy;
     431              :                 double dp_dz = (p_new[idx + ctx->stride_z] - p_new[idx - ctx->stride_z]) *
     432              :                                ctx->inv_2dz;
     433              : 
     434              :                 field->u[idx] = u_star[idx] - (dt_over_rho * dp_dx);
     435              :                 field->v[idx] = v_star[idx] - (dt_over_rho * dp_dy);
     436              :                 field->w[idx] = w_star[idx] - (dt_over_rho * dp_dz);
     437              : 
     438              :                 // Limit velocities to [-MAX_VELOCITY, MAX_VELOCITY]
     439              :                 field->u[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, field->u[idx]));
     440              :                 field->v[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, field->v[idx]));
     441              :                 field->w[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, field->w[idx]));
     442              :             }
     443              :         }
     444              :     }
     445              : 
     446              :     // Update pressure field
     447            0 :     memcpy(field->p, p_new, size * sizeof(double));
     448              : 
     449              :     // Energy equation: advance temperature after velocity correction (time argument is iter_count * dt)
     450              :     {
     451            0 :         cfd_status_t energy_status = energy_step_explicit_avx2_with_workspace(
     452            0 :             field, grid, params, dt, ctx->iter_count * dt, ctx->T_ws, size);
     453            0 :         if (energy_status != CFD_SUCCESS) {
     454              :             return energy_status;  // velocity and pressure in field were already updated above
     455              :         }
     456              :     }
     457              : 
     458              :     // Apply configured thermal BCs to temperature field
     459              :     {
     460            0 :         cfd_status_t bc_status = energy_apply_thermal_bcs(field, params);
     461            0 :         if (bc_status != CFD_SUCCESS) {
     462              :             return bc_status;
     463              :         }
     464              :     }
     465              : 
     466              :     // Copy boundary velocity values from star arrays (which have caller's BCs)
     467            0 :     copy_boundary_velocities_3d(field->u, field->v, field->w, u_star, v_star, w_star,
     468              :                                 nx, ny, ctx->nz);
     469              : 
     470              :     // Check for NaN/Inf in u, v, w and p (field->T is not inspected here)
     471            0 :     for (size_t n = 0; n < size; n++) {
     472            0 :         if (!isfinite(field->u[n]) || !isfinite(field->v[n]) ||
     473            0 :             !isfinite(field->w[n]) || !isfinite(field->p[n])) {
     474              :             return CFD_ERROR_DIVERGED;  // field already holds the non-finite values; iter_count is not advanced
     475              :         }
     476              :     }
     477              : 
     478            0 :     ctx->iter_count++;
     479              : 
     480            0 :     if (stats) {
     481            0 :         stats->iterations = 1;  // always 1 per call; the Poisson iteration count is not reported here
     482              :     }
     483              : 
     484              :     return CFD_SUCCESS;
     485              : }

Generated by: LCOV version 2.0-1