LCOV - code coverage report
Current view: top level - solvers/navier_stokes/omp - solver_rk2_omp.c (source / functions) Coverage Total Hit
Test: coverage.info Lines: 92.2 % 77 71
Test Date: 2026-03-04 10:22:18 Functions: 100.0 % 2 2

            Line data    Source code
       1              : /**
       2              :  * @file solver_rk2_omp.c
       3              :  * @brief RK2 (Heun's method) time integration - OpenMP parallelized
       4              :  *
       5              :  * Same algorithm as scalar RK2 but with OpenMP-parallelized loops.
       6              :  * Achieves O(dt^2) temporal accuracy with multi-threaded execution.
       7              :  *
       8              :  * Branch-free 3D: when nz==1, stride_z=0 and inv_2dz/inv_dz2=0.0 cause all
       9              :  * z-terms to vanish, producing bit-identical results to the 2D code path.
      10              :  */
      11              : 
      12              : #include "cfd/core/cfd_status.h"
      13              : #include "cfd/core/grid.h"
      14              : #include "cfd/core/indexing.h"
      15              : #include "cfd/core/memory.h"
      16              : #include "cfd/solvers/navier_stokes_solver.h"
      17              : 
      18              : #include <math.h>
      19              : #include <stddef.h>
      20              : #include <string.h>
      21              : 
      22              : #ifdef CFD_ENABLE_OPENMP
      23              : 
      24              : #include <omp.h>
      25              : 
      26              : /* OpenMP version-gated pragmas */
      27              : #if _OPENMP >= 201307  /* OMP 4.0 */
      28              : #  define OMP_FOR_SIMD _Pragma("omp parallel for simd schedule(static)")
      29              : #else
      30              : #  define OMP_FOR_SIMD _Pragma("omp parallel for schedule(static)")
      31              : #endif
      32              : 
      33              : #ifndef M_PI
      34              : #define M_PI 3.14159265358979323846
      35              : #endif
      36              : 
      37              : /* Physical stability limits (same as scalar RK2) */
      38              : #define MAX_DERIVATIVE_LIMIT        100.0
      39              : #define MAX_SECOND_DERIVATIVE_LIMIT 1000.0
      40              : #define MAX_VELOCITY_LIMIT          100.0
      41              : #define MAX_DIVERGENCE_LIMIT        10.0
      42              : #define PRESSURE_UPDATE_FACTOR      0.1
      43              : 
      44              : /* ============================================================================
      45              :  * RHS EVALUATION (OpenMP parallelized)
      46              :  * ============================================================================ */
      47              : 
      48          424 : static void compute_rhs_omp(const double* u, const double* v, const double* w,
      49              :                              const double* p, const double* rho,
      50              :                              double* rhs_u, double* rhs_v, double* rhs_w,
      51              :                              double* rhs_p,
      52              :                              const grid* grid, const ns_solver_params_t* params,
      53              :                              size_t nx, size_t ny, size_t nz,
      54              :                              size_t stride_z, size_t k_start, size_t k_end,
      55              :                              double inv_2dz, double inv_dz2,
      56              :                              int iter, double dt) {
      57          424 :     ptrdiff_t ny_int = (ptrdiff_t)ny;
      58          424 :     ptrdiff_t nx_int = (ptrdiff_t)nx;
      59              : 
      60          868 :     for (size_t k = k_start; k < k_end; k++) {
      61          444 :         ptrdiff_t j;
      62          444 : #pragma omp parallel for schedule(static)
      63              :         for (j = 1; j < ny_int - 1; j++) {
      64              :             for (ptrdiff_t i = 1; i < nx_int - 1; i++) {
      65              :                 size_t idx = k * stride_z + IDX_2D((size_t)i, (size_t)j, nx);
      66              : 
      67              :                 /* Safety checks */
      68              :                 if (rho[idx] <= 1e-10) {
      69              :                     rhs_u[idx] = 0.0;
      70              :                     rhs_v[idx] = 0.0;
      71              :                     rhs_w[idx] = 0.0;
      72              :                     rhs_p[idx] = 0.0;
      73              :                     continue;
      74              :                 }
      75              :                 if (fabs(grid->dx[i]) < 1e-10 || fabs(grid->dy[j]) < 1e-10) {
      76              :                     rhs_u[idx] = 0.0;
      77              :                     rhs_v[idx] = 0.0;
      78              :                     rhs_w[idx] = 0.0;
      79              :                     rhs_p[idx] = 0.0;
      80              :                     continue;
      81              :                 }
      82              : 
      83              :                 /* Periodic stencil indices in x and y — avoids relying on ghost
      84              :                  * cells, critical for preserving RK2 temporal order. */
      85              :                 size_t il = ((size_t)i > 1)      ? idx - 1  : k * stride_z + IDX_2D(nx - 2, (size_t)j, nx);
      86              :                 size_t ir = ((size_t)i < nx - 2) ? idx + 1  : k * stride_z + IDX_2D(1, (size_t)j, nx);
      87              :                 size_t jd = ((size_t)j > 1)      ? idx - nx : k * stride_z + IDX_2D((size_t)i, ny - 2, nx);
      88              :                 size_t ju = ((size_t)j < ny - 2) ? idx + nx : k * stride_z + IDX_2D((size_t)i, 1, nx);
      89              : 
      90              :                 /* Periodic stencil indices in z.
      91              :                  * When nz==1: k=0, stride_z=0, so kd=ku=idx → z-terms vanish. */
      92              :                 size_t kd = (k > 1)      ? idx - stride_z
      93              :                                          : (nz - 2) * stride_z + IDX_2D((size_t)i, (size_t)j, nx);
      94              :                 size_t ku = (k < nz - 2) ? idx + stride_z
      95              :                                          : 1 * stride_z + IDX_2D((size_t)i, (size_t)j, nx);
      96              : 
      97              :                 /* First derivatives (central differences) */
      98              :                 double du_dx = (u[ir] - u[il]) / (2.0 * grid->dx[i]);
      99              :                 double du_dy = (u[ju] - u[jd]) / (2.0 * grid->dy[j]);
     100              :                 double du_dz = (u[ku] - u[kd]) * inv_2dz;
     101              : 
     102              :                 double dv_dx = (v[ir] - v[il]) / (2.0 * grid->dx[i]);
     103              :                 double dv_dy = (v[ju] - v[jd]) / (2.0 * grid->dy[j]);
     104              :                 double dv_dz = (v[ku] - v[kd]) * inv_2dz;
     105              : 
     106              :                 double dw_dx = (w[ir] - w[il]) / (2.0 * grid->dx[i]);
     107              :                 double dw_dy = (w[ju] - w[jd]) / (2.0 * grid->dy[j]);
     108              :                 double dw_dz = (w[ku] - w[kd]) * inv_2dz;
     109              : 
     110              :                 /* Pressure gradients */
     111              :                 double dp_dx = (p[ir] - p[il]) / (2.0 * grid->dx[i]);
     112              :                 double dp_dy = (p[ju] - p[jd]) / (2.0 * grid->dy[j]);
     113              :                 double dp_dz = (p[ku] - p[kd]) * inv_2dz;
     114              : 
     115              :                 /* Second derivatives (viscous terms) */
     116              :                 double d2u_dx2 = (u[ir] - 2.0 * u[idx] + u[il]) / (grid->dx[i] * grid->dx[i]);
     117              :                 double d2u_dy2 = (u[ju] - 2.0 * u[idx] + u[jd]) / (grid->dy[j] * grid->dy[j]);
     118              :                 double d2u_dz2 = (u[ku] - 2.0 * u[idx] + u[kd]) * inv_dz2;
     119              : 
     120              :                 double d2v_dx2 = (v[ir] - 2.0 * v[idx] + v[il]) / (grid->dx[i] * grid->dx[i]);
     121              :                 double d2v_dy2 = (v[ju] - 2.0 * v[idx] + v[jd]) / (grid->dy[j] * grid->dy[j]);
     122              :                 double d2v_dz2 = (v[ku] - 2.0 * v[idx] + v[kd]) * inv_dz2;
     123              : 
     124              :                 double d2w_dx2 = (w[ir] - 2.0 * w[idx] + w[il]) / (grid->dx[i] * grid->dx[i]);
     125              :                 double d2w_dy2 = (w[ju] - 2.0 * w[idx] + w[jd]) / (grid->dy[j] * grid->dy[j]);
     126              :                 double d2w_dz2 = (w[ku] - 2.0 * w[idx] + w[kd]) * inv_dz2;
     127              : 
     128              :                 /* Kinematic viscosity */
     129              :                 double nu = params->mu / fmax(rho[idx], 1e-10);
     130              :                 nu = fmin(nu, 1.0);
     131              : 
     132              :                 /* Clamp first derivatives */
     133              :                 du_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dx));
     134              :                 du_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dy));
     135              :                 du_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dz));
     136              :                 dv_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dx));
     137              :                 dv_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dy));
     138              :                 dv_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dz));
     139              :                 dw_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dx));
     140              :                 dw_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dy));
     141              :                 dw_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dz));
     142              :                 dp_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dx));
     143              :                 dp_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dy));
     144              :                 dp_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dz));
     145              : 
     146              :                 /* Clamp second derivatives */
     147              :                 d2u_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dx2));
     148              :                 d2u_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dy2));
     149              :                 d2u_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dz2));
     150              :                 d2v_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dx2));
     151              :                 d2v_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dy2));
     152              :                 d2v_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dz2));
     153              :                 d2w_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dx2));
     154              :                 d2w_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dy2));
     155              :                 d2w_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dz2));
     156              : 
     157              :                 /* Source terms */
     158              :                 double source_u = 0.0, source_v = 0.0, source_w = 0.0;
     159              :                 double z_coord = (nz > 1 && grid->z) ? grid->z[k] : 0.0;
     160              :                 compute_source_terms(grid->x[i], grid->y[j], z_coord, iter, dt,
     161              :                                      params, &source_u, &source_v, &source_w);
     162              : 
     163              :                 /* RHS for u-momentum */
     164              :                 rhs_u[idx] = -u[idx] * du_dx - v[idx] * du_dy - w[idx] * du_dz
     165              :                              - dp_dx / rho[idx]
     166              :                              + nu * (d2u_dx2 + d2u_dy2 + d2u_dz2)
     167              :                              + source_u;
     168              : 
     169              :                 /* RHS for v-momentum */
     170              :                 rhs_v[idx] = -u[idx] * dv_dx - v[idx] * dv_dy - w[idx] * dv_dz
     171              :                              - dp_dy / rho[idx]
     172              :                              + nu * (d2v_dx2 + d2v_dy2 + d2v_dz2)
     173              :                              + source_v;
     174              : 
     175              :                 /* RHS for w-momentum */
     176              :                 rhs_w[idx] = -u[idx] * dw_dx - v[idx] * dw_dy - w[idx] * dw_dz
     177              :                              - dp_dz / rho[idx]
     178              :                              + nu * (d2w_dx2 + d2w_dy2 + d2w_dz2)
     179              :                              + source_w;
     180              : 
     181              :                 /* Simplified pressure RHS (divergence-based) */
     182              :                 double divergence = du_dx + dv_dy + dw_dz;
     183              :                 divergence = fmax(-MAX_DIVERGENCE_LIMIT,
     184              :                                   fmin(MAX_DIVERGENCE_LIMIT, divergence));
     185              :                 rhs_p[idx] = -PRESSURE_UPDATE_FACTOR * rho[idx] * divergence;
     186              :             }
     187              :         }
     188              :     }
     189          424 : }
     190              : 
     191              : /* ============================================================================
     192              :  * RK2 OMP SOLVER
     193              :  * ============================================================================ */
     194              : 
     195          212 : cfd_status_t rk2_omp_impl(flow_field* field, const grid* grid,
     196              :                             const ns_solver_params_t* params) {
     197          212 :     if (field->nx < 3 || field->ny < 3 || (field->nz > 1 && field->nz < 3)) {
     198              :         return CFD_ERROR_INVALID;
     199              :     }
     200              : 
     201          212 :     size_t nx = field->nx;
     202          212 :     size_t ny = field->ny;
     203          212 :     size_t nz = field->nz;
     204              : 
     205              :     /* Reject non-uniform z-spacing (solver uses constant inv_2dz/inv_dz2) */
     206          212 :     if (nz > 1 && grid->dz) {
     207           14 :         for (size_t k = 1; k < nz - 1; k++) {
     208           12 :             if (fabs(grid->dz[k] - grid->dz[0]) > 1e-14) {
     209              :                 return CFD_ERROR_INVALID;
     210              :             }
     211              :         }
     212              :     }
     213              : 
     214          212 :     size_t plane = nx * ny;
     215          212 :     size_t total = plane * nz;
     216          212 :     size_t bytes = total * sizeof(double);
     217              : 
     218              :     /* Branch-free 3D constants */
     219          212 :     size_t stride_z = (nz > 1) ? plane : 0;
     220          212 :     size_t k_start  = (nz > 1) ? 1 : 0;
     221          212 :     size_t k_end    = (nz > 1) ? (nz - 1) : 1;
     222          212 :     double inv_2dz  = (nz > 1 && grid->dz) ? 1.0 / (2.0 * grid->dz[0]) : 0.0;
     223            2 :     double inv_dz2  = (nz > 1 && grid->dz) ? 1.0 / (grid->dz[0] * grid->dz[0]) : 0.0;
     224              : 
     225              :     /* Allocate working arrays:
     226              :      *   k1_u/v/w/p : Stage 1 derivatives
     227              :      *   k2_u/v/w/p : Stage 2 derivatives
     228              :      *   u0/v0/w0/p0 : Saved state Q^n
     229              :      */
     230          212 :     double* k1_u = (double*)cfd_calloc(total, sizeof(double));
     231          212 :     double* k1_v = (double*)cfd_calloc(total, sizeof(double));
     232          212 :     double* k1_w = (double*)cfd_calloc(total, sizeof(double));
     233          212 :     double* k1_p = (double*)cfd_calloc(total, sizeof(double));
     234          212 :     double* k2_u = (double*)cfd_calloc(total, sizeof(double));
     235          212 :     double* k2_v = (double*)cfd_calloc(total, sizeof(double));
     236          212 :     double* k2_w = (double*)cfd_calloc(total, sizeof(double));
     237          212 :     double* k2_p = (double*)cfd_calloc(total, sizeof(double));
     238          212 :     double* u0 = (double*)cfd_calloc(total, sizeof(double));
     239          212 :     double* v0 = (double*)cfd_calloc(total, sizeof(double));
     240          212 :     double* w0 = (double*)cfd_calloc(total, sizeof(double));
     241          212 :     double* p0 = (double*)cfd_calloc(total, sizeof(double));
     242              : 
     243          212 :     if (!k1_u || !k1_v || !k1_w || !k1_p ||
     244          212 :         !k2_u || !k2_v || !k2_w || !k2_p ||
     245          212 :         !u0 || !v0 || !w0 || !p0) {
     246            0 :         cfd_free(k1_u); cfd_free(k1_v); cfd_free(k1_w); cfd_free(k1_p);
     247            0 :         cfd_free(k2_u); cfd_free(k2_v); cfd_free(k2_w); cfd_free(k2_p);
     248            0 :         cfd_free(u0); cfd_free(v0); cfd_free(w0); cfd_free(p0);
     249            0 :         return CFD_ERROR_NOMEM;
     250              :     }
     251              : 
     252          212 :     double dt = params->dt;
     253          212 :     ptrdiff_t total_int = (ptrdiff_t)total;
     254          212 :     cfd_status_t status = CFD_SUCCESS;
     255              : 
     256          424 :     for (int iter = 0; iter < params->max_iter; iter++) {
     257              :         /* Save Q^n */
     258          212 :         memcpy(u0, field->u, bytes);
     259          212 :         memcpy(v0, field->v, bytes);
     260          212 :         memcpy(w0, field->w, bytes);
     261          212 :         memcpy(p0, field->p, bytes);
     262              : 
     263              :         /* ---- Stage 1: k1 = RHS(Q^n) ---- */
     264          212 :         memset(k1_u, 0, bytes);
     265          212 :         memset(k1_v, 0, bytes);
     266          212 :         memset(k1_w, 0, bytes);
     267          212 :         memset(k1_p, 0, bytes);
     268              : 
     269          212 :         compute_rhs_omp(field->u, field->v, field->w, field->p, field->rho,
     270              :                          k1_u, k1_v, k1_w, k1_p,
     271              :                          grid, params, nx, ny, nz,
     272              :                          stride_z, k_start, k_end, inv_2dz, inv_dz2,
     273              :                          iter, dt);
     274              : 
     275              :         /* ---- Intermediate: field = Q^n + dt * k1 ---- */
     276              :         {
     277          212 :             ptrdiff_t kk;
     278          212 :             OMP_FOR_SIMD
     279              :             for (kk = 0; kk < total_int; kk++) {
     280              :                 field->u[kk] = u0[kk] + dt * k1_u[kk];
     281              :                 field->v[kk] = v0[kk] + dt * k1_v[kk];
     282              :                 field->w[kk] = w0[kk] + dt * k1_w[kk];
     283              :                 field->p[kk] = p0[kk] + dt * k1_p[kk];
     284              : 
     285              :                 field->u[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->u[kk]));
     286              :                 field->v[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->v[kk]));
     287              :                 field->w[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->w[kk]));
     288              :             }
     289              :         }
     290              : 
     291              :         /* NOTE: Do NOT apply BCs between RK stages. The ghost cells carry
     292              :          * zero-derivative evolution (k1[ghost]=0), which is consistent with
     293              :          * the semi-discrete ODE system. Applying BCs here would reduce RK2
     294              :          * to first-order temporal accuracy. */
     295              : 
     296              :         /* ---- Stage 2: k2 = RHS(Q_pred) ---- */
     297          212 :         memset(k2_u, 0, bytes);
     298          212 :         memset(k2_v, 0, bytes);
     299          212 :         memset(k2_w, 0, bytes);
     300          212 :         memset(k2_p, 0, bytes);
     301              : 
     302          212 :         compute_rhs_omp(field->u, field->v, field->w, field->p, field->rho,
     303              :                          k2_u, k2_v, k2_w, k2_p,
     304              :                          grid, params, nx, ny, nz,
     305              :                          stride_z, k_start, k_end, inv_2dz, inv_dz2,
     306              :                          iter, dt);
     307              : 
     308              :         /* ---- Final update: Q^{n+1} = Q^n + (dt/2)*(k1 + k2) ---- */
     309              :         {
     310          212 :             double half_dt = 0.5 * dt;
     311          212 :             ptrdiff_t kk;
     312          212 :             OMP_FOR_SIMD
     313              :             for (kk = 0; kk < total_int; kk++) {
     314              :                 field->u[kk] = u0[kk] + half_dt * (k1_u[kk] + k2_u[kk]);
     315              :                 field->v[kk] = v0[kk] + half_dt * (k1_v[kk] + k2_v[kk]);
     316              :                 field->w[kk] = w0[kk] + half_dt * (k1_w[kk] + k2_w[kk]);
     317              :                 field->p[kk] = p0[kk] + half_dt * (k1_p[kk] + k2_p[kk]);
     318              : 
     319              :                 field->u[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->u[kk]));
     320              :                 field->v[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->v[kk]));
     321              :                 field->w[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->w[kk]));
     322              :             }
     323              :         }
     324              : 
     325              :         /* Apply BCs to final state only (after the full RK2 step) */
     326          212 :         apply_boundary_conditions(field, grid);
     327              : 
     328              :         /* NaN / Inf check (parallelized) */
     329              :         {
     330          212 :             int has_nan = 0;
     331          212 :             ptrdiff_t kk;
     332          212 : #pragma omp parallel for reduction(| : has_nan) schedule(static)
     333              :             for (kk = 0; kk < total_int; kk++) {
     334              :                 if (!isfinite(field->u[kk]) || !isfinite(field->v[kk]) ||
     335              :                     !isfinite(field->w[kk]) || !isfinite(field->p[kk])) {
     336              :                     has_nan = 1;
     337              :                 }
     338              :             }
     339          212 :             if (has_nan) {
     340            0 :                 status = CFD_ERROR_DIVERGED;
     341            0 :                 goto cleanup;
     342              :             }
     343              :         }
     344              :     }
     345              : 
     346          212 : cleanup:
     347          212 :     cfd_free(k1_u); cfd_free(k1_v); cfd_free(k1_w); cfd_free(k1_p);
     348          212 :     cfd_free(k2_u); cfd_free(k2_v); cfd_free(k2_w); cfd_free(k2_p);
     349          212 :     cfd_free(u0); cfd_free(v0); cfd_free(w0); cfd_free(p0);
     350              : 
     351          212 :     return status;
     352              : }
     353              : 
     354              : #endif /* CFD_ENABLE_OPENMP */
        

Generated by: LCOV version 2.0-1