Line data Source code
1 : /**
2 : * @file solver_rk2_omp.c
3 : * @brief RK2 (Heun's method) time integration - OpenMP parallelized
4 : *
5 : * Same algorithm as scalar RK2 but with OpenMP-parallelized loops.
6 : * Achieves O(dt^2) temporal accuracy with multi-threaded execution.
7 : *
8 : * Branch-free 3D: when nz==1, stride_z=0 and inv_2dz/inv_dz2=0.0 cause all
9 : * z-terms to vanish, producing bit-identical results to the 2D code path.
10 : */
11 :
12 : #include "cfd/core/cfd_status.h"
13 : #include "cfd/core/grid.h"
14 : #include "cfd/core/indexing.h"
15 : #include "cfd/core/memory.h"
16 : #include "cfd/solvers/navier_stokes_solver.h"
17 :
18 : #include <math.h>
19 : #include <stddef.h>
20 : #include <string.h>
21 :
22 : #ifdef CFD_ENABLE_OPENMP
23 :
24 : #include <omp.h>
25 :
26 : /* OpenMP version-gated pragmas */
27 : #if _OPENMP >= 201307 /* OMP 4.0 */
28 : # define OMP_FOR_SIMD _Pragma("omp parallel for simd schedule(static)")
29 : #else
30 : # define OMP_FOR_SIMD _Pragma("omp parallel for schedule(static)")
31 : #endif
32 :
33 : #ifndef M_PI
34 : #define M_PI 3.14159265358979323846
35 : #endif
36 :
37 : /* Physical stability limits (same as scalar RK2) */
38 : #define MAX_DERIVATIVE_LIMIT 100.0
39 : #define MAX_SECOND_DERIVATIVE_LIMIT 1000.0
40 : #define MAX_VELOCITY_LIMIT 100.0
41 : #define MAX_DIVERGENCE_LIMIT 10.0
42 : #define PRESSURE_UPDATE_FACTOR 0.1
43 :
44 : /* ============================================================================
45 : * RHS EVALUATION (OpenMP parallelized)
46 : * ============================================================================ */
47 :
48 424 : static void compute_rhs_omp(const double* u, const double* v, const double* w,
49 : const double* p, const double* rho,
50 : double* rhs_u, double* rhs_v, double* rhs_w,
51 : double* rhs_p,
52 : const grid* grid, const ns_solver_params_t* params,
53 : size_t nx, size_t ny, size_t nz,
54 : size_t stride_z, size_t k_start, size_t k_end,
55 : double inv_2dz, double inv_dz2,
56 : int iter, double dt) {
57 424 : ptrdiff_t ny_int = (ptrdiff_t)ny;
58 424 : ptrdiff_t nx_int = (ptrdiff_t)nx;
59 :
60 868 : for (size_t k = k_start; k < k_end; k++) {
61 444 : ptrdiff_t j;
62 444 : #pragma omp parallel for schedule(static)
63 : for (j = 1; j < ny_int - 1; j++) {
64 : for (ptrdiff_t i = 1; i < nx_int - 1; i++) {
65 : size_t idx = k * stride_z + IDX_2D((size_t)i, (size_t)j, nx);
66 :
67 : /* Safety checks */
68 : if (rho[idx] <= 1e-10) {
69 : rhs_u[idx] = 0.0;
70 : rhs_v[idx] = 0.0;
71 : rhs_w[idx] = 0.0;
72 : rhs_p[idx] = 0.0;
73 : continue;
74 : }
75 : if (fabs(grid->dx[i]) < 1e-10 || fabs(grid->dy[j]) < 1e-10) {
76 : rhs_u[idx] = 0.0;
77 : rhs_v[idx] = 0.0;
78 : rhs_w[idx] = 0.0;
79 : rhs_p[idx] = 0.0;
80 : continue;
81 : }
82 :
83 : /* Periodic stencil indices in x and y — avoids relying on ghost
84 : * cells, critical for preserving RK2 temporal order. */
85 : size_t il = ((size_t)i > 1) ? idx - 1 : k * stride_z + IDX_2D(nx - 2, (size_t)j, nx);
86 : size_t ir = ((size_t)i < nx - 2) ? idx + 1 : k * stride_z + IDX_2D(1, (size_t)j, nx);
87 : size_t jd = ((size_t)j > 1) ? idx - nx : k * stride_z + IDX_2D((size_t)i, ny - 2, nx);
88 : size_t ju = ((size_t)j < ny - 2) ? idx + nx : k * stride_z + IDX_2D((size_t)i, 1, nx);
89 :
90 : /* Periodic stencil indices in z.
91 : * When nz==1: k=0, stride_z=0, so kd=ku=idx → z-terms vanish. */
92 : size_t kd = (k > 1) ? idx - stride_z
93 : : (nz - 2) * stride_z + IDX_2D((size_t)i, (size_t)j, nx);
94 : size_t ku = (k < nz - 2) ? idx + stride_z
95 : : 1 * stride_z + IDX_2D((size_t)i, (size_t)j, nx);
96 :
97 : /* First derivatives (central differences) */
98 : double du_dx = (u[ir] - u[il]) / (2.0 * grid->dx[i]);
99 : double du_dy = (u[ju] - u[jd]) / (2.0 * grid->dy[j]);
100 : double du_dz = (u[ku] - u[kd]) * inv_2dz;
101 :
102 : double dv_dx = (v[ir] - v[il]) / (2.0 * grid->dx[i]);
103 : double dv_dy = (v[ju] - v[jd]) / (2.0 * grid->dy[j]);
104 : double dv_dz = (v[ku] - v[kd]) * inv_2dz;
105 :
106 : double dw_dx = (w[ir] - w[il]) / (2.0 * grid->dx[i]);
107 : double dw_dy = (w[ju] - w[jd]) / (2.0 * grid->dy[j]);
108 : double dw_dz = (w[ku] - w[kd]) * inv_2dz;
109 :
110 : /* Pressure gradients */
111 : double dp_dx = (p[ir] - p[il]) / (2.0 * grid->dx[i]);
112 : double dp_dy = (p[ju] - p[jd]) / (2.0 * grid->dy[j]);
113 : double dp_dz = (p[ku] - p[kd]) * inv_2dz;
114 :
115 : /* Second derivatives (viscous terms) */
116 : double d2u_dx2 = (u[ir] - 2.0 * u[idx] + u[il]) / (grid->dx[i] * grid->dx[i]);
117 : double d2u_dy2 = (u[ju] - 2.0 * u[idx] + u[jd]) / (grid->dy[j] * grid->dy[j]);
118 : double d2u_dz2 = (u[ku] - 2.0 * u[idx] + u[kd]) * inv_dz2;
119 :
120 : double d2v_dx2 = (v[ir] - 2.0 * v[idx] + v[il]) / (grid->dx[i] * grid->dx[i]);
121 : double d2v_dy2 = (v[ju] - 2.0 * v[idx] + v[jd]) / (grid->dy[j] * grid->dy[j]);
122 : double d2v_dz2 = (v[ku] - 2.0 * v[idx] + v[kd]) * inv_dz2;
123 :
124 : double d2w_dx2 = (w[ir] - 2.0 * w[idx] + w[il]) / (grid->dx[i] * grid->dx[i]);
125 : double d2w_dy2 = (w[ju] - 2.0 * w[idx] + w[jd]) / (grid->dy[j] * grid->dy[j]);
126 : double d2w_dz2 = (w[ku] - 2.0 * w[idx] + w[kd]) * inv_dz2;
127 :
128 : /* Kinematic viscosity */
129 : double nu = params->mu / fmax(rho[idx], 1e-10);
130 : nu = fmin(nu, 1.0);
131 :
132 : /* Clamp first derivatives */
133 : du_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dx));
134 : du_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dy));
135 : du_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dz));
136 : dv_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dx));
137 : dv_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dy));
138 : dv_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dz));
139 : dw_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dx));
140 : dw_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dy));
141 : dw_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dz));
142 : dp_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dx));
143 : dp_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dy));
144 : dp_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dz));
145 :
146 : /* Clamp second derivatives */
147 : d2u_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dx2));
148 : d2u_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dy2));
149 : d2u_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dz2));
150 : d2v_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dx2));
151 : d2v_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dy2));
152 : d2v_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dz2));
153 : d2w_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dx2));
154 : d2w_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dy2));
155 : d2w_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dz2));
156 :
157 : /* Source terms */
158 : double source_u = 0.0, source_v = 0.0, source_w = 0.0;
159 : double z_coord = (nz > 1 && grid->z) ? grid->z[k] : 0.0;
160 : compute_source_terms(grid->x[i], grid->y[j], z_coord, iter, dt,
161 : params, &source_u, &source_v, &source_w);
162 :
163 : /* RHS for u-momentum */
164 : rhs_u[idx] = -u[idx] * du_dx - v[idx] * du_dy - w[idx] * du_dz
165 : - dp_dx / rho[idx]
166 : + nu * (d2u_dx2 + d2u_dy2 + d2u_dz2)
167 : + source_u;
168 :
169 : /* RHS for v-momentum */
170 : rhs_v[idx] = -u[idx] * dv_dx - v[idx] * dv_dy - w[idx] * dv_dz
171 : - dp_dy / rho[idx]
172 : + nu * (d2v_dx2 + d2v_dy2 + d2v_dz2)
173 : + source_v;
174 :
175 : /* RHS for w-momentum */
176 : rhs_w[idx] = -u[idx] * dw_dx - v[idx] * dw_dy - w[idx] * dw_dz
177 : - dp_dz / rho[idx]
178 : + nu * (d2w_dx2 + d2w_dy2 + d2w_dz2)
179 : + source_w;
180 :
181 : /* Simplified pressure RHS (divergence-based) */
182 : double divergence = du_dx + dv_dy + dw_dz;
183 : divergence = fmax(-MAX_DIVERGENCE_LIMIT,
184 : fmin(MAX_DIVERGENCE_LIMIT, divergence));
185 : rhs_p[idx] = -PRESSURE_UPDATE_FACTOR * rho[idx] * divergence;
186 : }
187 : }
188 : }
189 424 : }
190 :
191 : /* ============================================================================
192 : * RK2 OMP SOLVER
193 : * ============================================================================ */
194 :
195 212 : cfd_status_t rk2_omp_impl(flow_field* field, const grid* grid,
196 : const ns_solver_params_t* params) {
197 212 : if (field->nx < 3 || field->ny < 3 || (field->nz > 1 && field->nz < 3)) {
198 : return CFD_ERROR_INVALID;
199 : }
200 :
201 212 : size_t nx = field->nx;
202 212 : size_t ny = field->ny;
203 212 : size_t nz = field->nz;
204 :
205 : /* Reject non-uniform z-spacing (solver uses constant inv_2dz/inv_dz2) */
206 212 : if (nz > 1 && grid->dz) {
207 14 : for (size_t k = 1; k < nz - 1; k++) {
208 12 : if (fabs(grid->dz[k] - grid->dz[0]) > 1e-14) {
209 : return CFD_ERROR_INVALID;
210 : }
211 : }
212 : }
213 :
214 212 : size_t plane = nx * ny;
215 212 : size_t total = plane * nz;
216 212 : size_t bytes = total * sizeof(double);
217 :
218 : /* Branch-free 3D constants */
219 212 : size_t stride_z = (nz > 1) ? plane : 0;
220 212 : size_t k_start = (nz > 1) ? 1 : 0;
221 212 : size_t k_end = (nz > 1) ? (nz - 1) : 1;
222 212 : double inv_2dz = (nz > 1 && grid->dz) ? 1.0 / (2.0 * grid->dz[0]) : 0.0;
223 2 : double inv_dz2 = (nz > 1 && grid->dz) ? 1.0 / (grid->dz[0] * grid->dz[0]) : 0.0;
224 :
225 : /* Allocate working arrays:
226 : * k1_u/v/w/p : Stage 1 derivatives
227 : * k2_u/v/w/p : Stage 2 derivatives
228 : * u0/v0/w0/p0 : Saved state Q^n
229 : */
230 212 : double* k1_u = (double*)cfd_calloc(total, sizeof(double));
231 212 : double* k1_v = (double*)cfd_calloc(total, sizeof(double));
232 212 : double* k1_w = (double*)cfd_calloc(total, sizeof(double));
233 212 : double* k1_p = (double*)cfd_calloc(total, sizeof(double));
234 212 : double* k2_u = (double*)cfd_calloc(total, sizeof(double));
235 212 : double* k2_v = (double*)cfd_calloc(total, sizeof(double));
236 212 : double* k2_w = (double*)cfd_calloc(total, sizeof(double));
237 212 : double* k2_p = (double*)cfd_calloc(total, sizeof(double));
238 212 : double* u0 = (double*)cfd_calloc(total, sizeof(double));
239 212 : double* v0 = (double*)cfd_calloc(total, sizeof(double));
240 212 : double* w0 = (double*)cfd_calloc(total, sizeof(double));
241 212 : double* p0 = (double*)cfd_calloc(total, sizeof(double));
242 :
243 212 : if (!k1_u || !k1_v || !k1_w || !k1_p ||
244 212 : !k2_u || !k2_v || !k2_w || !k2_p ||
245 212 : !u0 || !v0 || !w0 || !p0) {
246 0 : cfd_free(k1_u); cfd_free(k1_v); cfd_free(k1_w); cfd_free(k1_p);
247 0 : cfd_free(k2_u); cfd_free(k2_v); cfd_free(k2_w); cfd_free(k2_p);
248 0 : cfd_free(u0); cfd_free(v0); cfd_free(w0); cfd_free(p0);
249 0 : return CFD_ERROR_NOMEM;
250 : }
251 :
252 212 : double dt = params->dt;
253 212 : ptrdiff_t total_int = (ptrdiff_t)total;
254 212 : cfd_status_t status = CFD_SUCCESS;
255 :
256 424 : for (int iter = 0; iter < params->max_iter; iter++) {
257 : /* Save Q^n */
258 212 : memcpy(u0, field->u, bytes);
259 212 : memcpy(v0, field->v, bytes);
260 212 : memcpy(w0, field->w, bytes);
261 212 : memcpy(p0, field->p, bytes);
262 :
263 : /* ---- Stage 1: k1 = RHS(Q^n) ---- */
264 212 : memset(k1_u, 0, bytes);
265 212 : memset(k1_v, 0, bytes);
266 212 : memset(k1_w, 0, bytes);
267 212 : memset(k1_p, 0, bytes);
268 :
269 212 : compute_rhs_omp(field->u, field->v, field->w, field->p, field->rho,
270 : k1_u, k1_v, k1_w, k1_p,
271 : grid, params, nx, ny, nz,
272 : stride_z, k_start, k_end, inv_2dz, inv_dz2,
273 : iter, dt);
274 :
275 : /* ---- Intermediate: field = Q^n + dt * k1 ---- */
276 : {
277 212 : ptrdiff_t kk;
278 212 : OMP_FOR_SIMD
279 : for (kk = 0; kk < total_int; kk++) {
280 : field->u[kk] = u0[kk] + dt * k1_u[kk];
281 : field->v[kk] = v0[kk] + dt * k1_v[kk];
282 : field->w[kk] = w0[kk] + dt * k1_w[kk];
283 : field->p[kk] = p0[kk] + dt * k1_p[kk];
284 :
285 : field->u[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->u[kk]));
286 : field->v[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->v[kk]));
287 : field->w[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->w[kk]));
288 : }
289 : }
290 :
291 : /* NOTE: Do NOT apply BCs between RK stages. The ghost cells carry
292 : * zero-derivative evolution (k1[ghost]=0), which is consistent with
293 : * the semi-discrete ODE system. Applying BCs here would reduce RK2
294 : * to first-order temporal accuracy. */
295 :
296 : /* ---- Stage 2: k2 = RHS(Q_pred) ---- */
297 212 : memset(k2_u, 0, bytes);
298 212 : memset(k2_v, 0, bytes);
299 212 : memset(k2_w, 0, bytes);
300 212 : memset(k2_p, 0, bytes);
301 :
302 212 : compute_rhs_omp(field->u, field->v, field->w, field->p, field->rho,
303 : k2_u, k2_v, k2_w, k2_p,
304 : grid, params, nx, ny, nz,
305 : stride_z, k_start, k_end, inv_2dz, inv_dz2,
306 : iter, dt);
307 :
308 : /* ---- Final update: Q^{n+1} = Q^n + (dt/2)*(k1 + k2) ---- */
309 : {
310 212 : double half_dt = 0.5 * dt;
311 212 : ptrdiff_t kk;
312 212 : OMP_FOR_SIMD
313 : for (kk = 0; kk < total_int; kk++) {
314 : field->u[kk] = u0[kk] + half_dt * (k1_u[kk] + k2_u[kk]);
315 : field->v[kk] = v0[kk] + half_dt * (k1_v[kk] + k2_v[kk]);
316 : field->w[kk] = w0[kk] + half_dt * (k1_w[kk] + k2_w[kk]);
317 : field->p[kk] = p0[kk] + half_dt * (k1_p[kk] + k2_p[kk]);
318 :
319 : field->u[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->u[kk]));
320 : field->v[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->v[kk]));
321 : field->w[kk] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, field->w[kk]));
322 : }
323 : }
324 :
325 : /* Apply BCs to final state only (after the full RK2 step) */
326 212 : apply_boundary_conditions(field, grid);
327 :
328 : /* NaN / Inf check (parallelized) */
329 : {
330 212 : int has_nan = 0;
331 212 : ptrdiff_t kk;
332 212 : #pragma omp parallel for reduction(| : has_nan) schedule(static)
333 : for (kk = 0; kk < total_int; kk++) {
334 : if (!isfinite(field->u[kk]) || !isfinite(field->v[kk]) ||
335 : !isfinite(field->w[kk]) || !isfinite(field->p[kk])) {
336 : has_nan = 1;
337 : }
338 : }
339 212 : if (has_nan) {
340 0 : status = CFD_ERROR_DIVERGED;
341 0 : goto cleanup;
342 : }
343 : }
344 : }
345 :
346 212 : cleanup:
347 212 : cfd_free(k1_u); cfd_free(k1_v); cfd_free(k1_w); cfd_free(k1_p);
348 212 : cfd_free(k2_u); cfd_free(k2_v); cfd_free(k2_w); cfd_free(k2_p);
349 212 : cfd_free(u0); cfd_free(v0); cfd_free(w0); cfd_free(p0);
350 :
351 212 : return status;
352 : }
353 :
354 : #endif /* CFD_ENABLE_OPENMP */
|