Line data Source code
1 : /**
2 : * @file solver_rk2_avx2.c
3 : * @brief RK2 (Heun's method) - AVX2 + OpenMP implementation
4 : *
5 : * AVX2-only backend. Returns CFD_ERROR_UNSUPPORTED if AVX2 is unavailable.
6 : * Uses persistent aligned buffers in context to avoid per-step allocation.
7 : *
8 : * SIMD strategy per row (fixed j, k):
9 : * i = 1 : scalar (periodic il wrap)
10 : * i = 2..nx-4 : AVX2 vectorized (interior, no wrapping)
11 : * i = nx-2 : scalar (periodic ir wrap)
12 : * Remainder between AVX2 end and nx-2: scalar (no wrapping)
13 : *
14 : * Algorithm: identical to scalar/OMP RK2.
15 : * k1 = RHS(Q^n)
16 : * Q_pred = Q^n + dt*k1
17 : * k2 = RHS(Q_pred)
18 : * Q^{n+1} = Q^n + (dt/2)*(k1 + k2)
19 : * BCs applied to final state only.
20 : *
21 : * Branch-free 3D: when nz==1, stride_z=0 and inv_2dz/inv_dz2=0.0 cause all
22 : * z-terms to vanish, producing bit-identical results to the 2D code path.
23 : */
24 :
25 : #define _POSIX_C_SOURCE 200809L
26 : #define _ISOC11_SOURCE
27 : #define _USE_MATH_DEFINES
28 :
29 : #include "cfd/core/cfd_status.h"
30 : #include "cfd/core/grid.h"
31 : #include "cfd/core/indexing.h"
32 : #include "cfd/core/memory.h"
33 : #include "cfd/solvers/navier_stokes_solver.h"
34 :
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
39 :
40 : #ifndef M_PI
41 : #define M_PI 3.14159265358979323846
42 : #endif
43 :
44 : #ifdef _OPENMP
45 : #include <omp.h>
46 : #endif
47 :
48 : #if defined(CFD_HAS_AVX2)
49 : #include <immintrin.h>
50 : #define USE_AVX2 1
51 : #else
52 : #define USE_AVX2 0
53 : #endif
54 :
55 : /* Physical stability limits (identical to scalar/OMP RK2) */
56 : #define MAX_DERIVATIVE_LIMIT 100.0
57 : #define MAX_SECOND_DERIVATIVE_LIMIT 1000.0
58 : #define MAX_VELOCITY_LIMIT 100.0
59 : #define MAX_DIVERGENCE_LIMIT 10.0
60 : #define PRESSURE_UPDATE_FACTOR 0.1
61 :
62 : /* ============================================================================
63 : * CONTEXT
64 : * ============================================================================ */
65 :
66 : typedef struct {
67 : double* k1_u; double* k1_v; double* k1_w; double* k1_p;
68 : double* k2_u; double* k2_v; double* k2_w; double* k2_p;
69 : double* u0; double* v0; double* w0; double* p0;
70 : double* dx_inv; /* 1/(2*dx[i]) for i = 0..nx-1 */
71 : double* dy_inv; /* 1/(2*dy[j]) for j = 0..ny-1 */
72 : size_t nx, ny, nz;
73 : size_t stride_z;
74 : size_t k_start, k_end;
75 : double inv_2dz, inv_dz2;
76 : int initialized;
77 : } rk2_avx2_context_t;
78 :
79 : /* ============================================================================
80 : * PUBLIC API DECLARATIONS
81 : * ============================================================================ */
82 :
83 : cfd_status_t rk2_avx2_init(ns_solver_t* solver, const grid* g,
84 : const ns_solver_params_t* params);
85 : void rk2_avx2_destroy(ns_solver_t* solver);
86 : cfd_status_t rk2_avx2_step(ns_solver_t* solver, flow_field* field, const grid* g,
87 : const ns_solver_params_t* params, ns_solver_stats_t* stats);
88 :
89 : /* ============================================================================
90 : * SCALAR POINT HELPER
91 : * Handles a single interior grid point given explicit stencil indices.
92 : * Used for periodic-edge columns (i=1 with il wrap, i=nx-2 with ir wrap)
93 : * and for scalar remainder after the AVX2 group.
94 : * ============================================================================ */
95 :
96 : static void rk2_rhs_point(
97 : const double* u, const double* v, const double* w,
98 : const double* p, const double* rho,
99 : double* rhs_u, double* rhs_v, double* rhs_w, double* rhs_p,
100 : const grid* g, const ns_solver_params_t* params,
101 : size_t idx, size_t il, size_t ir, size_t jd, size_t ju,
102 : size_t kd, size_t ku,
103 : double inv_2dz, double inv_dz2,
104 : size_t i, size_t j, size_t k, size_t nz,
105 : int iter, double dt)
106 : {
107 : if (rho[idx] <= 1e-10 || fabs(g->dx[i]) < 1e-10 || fabs(g->dy[j]) < 1e-10) {
108 : rhs_u[idx] = 0.0;
109 : rhs_v[idx] = 0.0;
110 : rhs_w[idx] = 0.0;
111 : rhs_p[idx] = 0.0;
112 : return;
113 : }
114 :
115 : double dx = g->dx[i], dy = g->dy[j];
116 : double dx2 = dx * dx, dy2 = dy * dy;
117 :
118 : double du_dx = (u[ir] - u[il]) / (2.0 * dx);
119 : double du_dy = (u[ju] - u[jd]) / (2.0 * dy);
120 : double du_dz = (u[ku] - u[kd]) * inv_2dz;
121 : double dv_dx = (v[ir] - v[il]) / (2.0 * dx);
122 : double dv_dy = (v[ju] - v[jd]) / (2.0 * dy);
123 : double dv_dz = (v[ku] - v[kd]) * inv_2dz;
124 : double dw_dx = (w[ir] - w[il]) / (2.0 * dx);
125 : double dw_dy = (w[ju] - w[jd]) / (2.0 * dy);
126 : double dw_dz = (w[ku] - w[kd]) * inv_2dz;
127 : double dp_dx = (p[ir] - p[il]) / (2.0 * dx);
128 : double dp_dy = (p[ju] - p[jd]) / (2.0 * dy);
129 : double dp_dz = (p[ku] - p[kd]) * inv_2dz;
130 :
131 : double d2u_dx2 = (u[ir] - 2.0 * u[idx] + u[il]) / dx2;
132 : double d2u_dy2 = (u[ju] - 2.0 * u[idx] + u[jd]) / dy2;
133 : double d2u_dz2 = (u[ku] - 2.0 * u[idx] + u[kd]) * inv_dz2;
134 : double d2v_dx2 = (v[ir] - 2.0 * v[idx] + v[il]) / dx2;
135 : double d2v_dy2 = (v[ju] - 2.0 * v[idx] + v[jd]) / dy2;
136 : double d2v_dz2 = (v[ku] - 2.0 * v[idx] + v[kd]) * inv_dz2;
137 : double d2w_dx2 = (w[ir] - 2.0 * w[idx] + w[il]) / dx2;
138 : double d2w_dy2 = (w[ju] - 2.0 * w[idx] + w[jd]) / dy2;
139 : double d2w_dz2 = (w[ku] - 2.0 * w[idx] + w[kd]) * inv_dz2;
140 :
141 : double nu = params->mu / fmax(rho[idx], 1e-10);
142 : nu = fmin(nu, 1.0);
143 :
144 : du_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dx));
145 : du_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dy));
146 : du_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dz));
147 : dv_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dx));
148 : dv_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dy));
149 : dv_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dz));
150 : dw_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dx));
151 : dw_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dy));
152 : dw_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dz));
153 : dp_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dx));
154 : dp_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dy));
155 : dp_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dz));
156 : d2u_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dx2));
157 : d2u_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dy2));
158 : d2u_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2u_dz2));
159 : d2v_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dx2));
160 : d2v_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dy2));
161 : d2v_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2v_dz2));
162 : d2w_dx2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dx2));
163 : d2w_dy2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dy2));
164 : d2w_dz2 = fmax(-MAX_SECOND_DERIVATIVE_LIMIT, fmin(MAX_SECOND_DERIVATIVE_LIMIT, d2w_dz2));
165 :
166 : double source_u = 0.0, source_v = 0.0, source_w = 0.0;
167 : double z_coord = (nz > 1 && g->z) ? g->z[k] : 0.0;
168 : compute_source_terms(g->x[i], g->y[j], z_coord, iter, dt, params,
169 : &source_u, &source_v, &source_w);
170 :
171 : rhs_u[idx] = -(u[idx] * du_dx) - (v[idx] * du_dy) - (w[idx] * du_dz)
172 : - dp_dx / rho[idx]
173 : + (nu * (d2u_dx2 + d2u_dy2 + d2u_dz2))
174 : + source_u;
175 :
176 : rhs_v[idx] = -(u[idx] * dv_dx) - (v[idx] * dv_dy) - (w[idx] * dv_dz)
177 : - dp_dy / rho[idx]
178 : + (nu * (d2v_dx2 + d2v_dy2 + d2v_dz2))
179 : + source_v;
180 :
181 : rhs_w[idx] = -(u[idx] * dw_dx) - (v[idx] * dw_dy) - (w[idx] * dw_dz)
182 : - dp_dz / rho[idx]
183 : + (nu * (d2w_dx2 + d2w_dy2 + d2w_dz2))
184 : + source_w;
185 :
186 : double divergence = du_dx + dv_dy + dw_dz;
187 : divergence = fmax(-MAX_DIVERGENCE_LIMIT, fmin(MAX_DIVERGENCE_LIMIT, divergence));
188 : rhs_p[idx] = -PRESSURE_UPDATE_FACTOR * rho[idx] * divergence;
189 : }
190 :
191 : /* ============================================================================
192 : * AVX2 ROW-LEVEL RHS (compiled only when AVX2 is available)
193 : * ============================================================================ */
194 :
195 : #if USE_AVX2
196 :
/**
 * Clamp each lane of x into [lo, hi].
 *
 * FIX: operand order matters for NaN. VMINPD/VMAXPD return the SECOND
 * source operand when either input is NaN, so the previous form
 * _mm256_max_pd(lo, _mm256_min_pd(hi, x)) propagated a NaN lane
 * unchanged. The scalar path uses fmax(-LIM, fmin(LIM, x)), and C's
 * fmin/fmax return the non-NaN operand, so a NaN collapses to +LIM
 * there. Putting x first in the min makes a NaN lane resolve to hi,
 * matching the scalar path lane-for-lane.
 */
static inline __m256d avx2_clamp(__m256d x, __m256d lo, __m256d hi) {
    return _mm256_max_pd(_mm256_min_pd(x, hi), lo);
}
200 :
201 : /*
202 : * Compute RHS for one row j at z-plane k, dispatching:
203 : * i=1 : scalar (periodic il wrap)
204 : * i=2..nx-4 : AVX2 (4-wide, no wrapping)
205 : * i=nx-2 : scalar (periodic ir wrap), plus any scalar remainder
206 : */
static void compute_rhs_row(
    const double* u, const double* v, const double* w,
    const double* p, const double* rho,
    double* rhs_u, double* rhs_v, double* rhs_w, double* rhs_p,
    const rk2_avx2_context_t* ctx, const grid* g,
    const ns_solver_params_t* params, size_t j, size_t k, int iter, double dt)
{
    size_t nx = ctx->nx;
    size_t ny = ctx->ny;
    size_t nz = ctx->nz;
    size_t stride_z = ctx->stride_z;
    double inv_2dz = ctx->inv_2dz;
    double inv_dz2 = ctx->inv_dz2;

    /* Linear offsets of this z-plane and of row j within it. */
    size_t k_off = k * stride_z;
    size_t j_off = k_off + j * nx;

    /* Degenerate dy: zero RHS for entire row (matches scalar path) */
    if (fabs(g->dy[j]) < 1e-10) {
        return; /* rhs arrays already memset to 0 before this call */
    }

    /* j-direction stencil row offsets: uniform across entire row.
     * Periodic wrap: j=1 reaches down to row ny-2; j=ny-2 reaches up to row 1. */
    size_t jd_row = (j > 1) ? k_off + (j - 1) * nx : k_off + (ny - 2) * nx;
    size_t ju_row = (j < ny - 2) ? k_off + (j + 1) * nx : k_off + nx;

    /* z-direction stencil plane offsets: uniform across entire row.
     * When nz==1: stride_z=0, so kd_off=ku_off=0 and z-terms vanish. */
    size_t kd_off = (k > 1) ? (k - 1) * stride_z : (nz - 2) * stride_z;
    size_t ku_off = (k < nz - 2) ? (k + 1) * stride_z : 1 * stride_z;

    /* --- Scalar: i = 1 (il wraps periodically) --- */
    {
        size_t si = 1;
        size_t idx = j_off + si;
        size_t il = j_off + (nx - 2); /* periodic wrap */
        size_t ir = idx + 1;
        size_t kd = kd_off + j * nx + si;
        size_t ku = ku_off + j * nx + si;
        rk2_rhs_point(u, v, w, p, rho, rhs_u, rhs_v, rhs_w, rhs_p, g, params,
                      idx, il, ir, jd_row + si, ju_row + si, kd, ku,
                      inv_2dz, inv_dz2, si, j, k, nz, iter, dt);
    }

    /* --- AVX2: i = 2 while i+3 <= nx-3 (interior, no wrapping needed) ---
     * Highest vector index is nx-3, whose ir neighbor is nx-2 — still an
     * in-bounds load, so no lane in this loop ever needs a periodic wrap. */
    {
        /* Row/plane-constant broadcast factors and clamp bounds. */
        __m256d dy_inv_v  = _mm256_set1_pd(ctx->dy_inv[j]);
        /* 1/dy^2 = 4 * (1/(2*dy))^2, derived from pre-guarded dy_inv */
        __m256d dy2_inv_v = _mm256_set1_pd(4.0 * ctx->dy_inv[j] * ctx->dy_inv[j]);
        __m256d dz_inv_v  = _mm256_set1_pd(inv_2dz);
        __m256d dz2_inv_v = _mm256_set1_pd(inv_dz2);
        __m256d two     = _mm256_set1_pd(2.0);
        __m256d four    = _mm256_set1_pd(4.0);
        __m256d max_d1  = _mm256_set1_pd( MAX_DERIVATIVE_LIMIT);
        __m256d min_d1  = _mm256_set1_pd(-MAX_DERIVATIVE_LIMIT);
        __m256d max_d2  = _mm256_set1_pd( MAX_SECOND_DERIVATIVE_LIMIT);
        __m256d min_d2  = _mm256_set1_pd(-MAX_SECOND_DERIVATIVE_LIMIT);
        __m256d max_div = _mm256_set1_pd( MAX_DIVERGENCE_LIMIT);
        __m256d min_div = _mm256_set1_pd(-MAX_DIVERGENCE_LIMIT);
        __m256d mu_v    = _mm256_set1_pd(params->mu);
        __m256d eps     = _mm256_set1_pd(1e-10);
        __m256d one     = _mm256_set1_pd(1.0);
        __m256d neg_puf = _mm256_set1_pd(-PRESSURE_UPDATE_FACTOR);

        /* Signed index so the scalar remainder loop below can reuse it. */
        ptrdiff_t i;
        for (i = 2; i + 3 <= (ptrdiff_t)(nx - 3); i += 4) {
            size_t idx = j_off + (size_t)i;

            /* Load u stencil (unaligned: idx is not 32-byte aligned in general) */
            __m256d u_c  = _mm256_loadu_pd(&u[idx]);
            __m256d u_il = _mm256_loadu_pd(&u[idx - 1]);
            __m256d u_ir = _mm256_loadu_pd(&u[idx + 1]);
            __m256d u_jd = _mm256_loadu_pd(&u[jd_row + (size_t)i]);
            __m256d u_ju = _mm256_loadu_pd(&u[ju_row + (size_t)i]);
            __m256d u_kd = _mm256_loadu_pd(&u[kd_off + j * nx + (size_t)i]);
            __m256d u_ku = _mm256_loadu_pd(&u[ku_off + j * nx + (size_t)i]);

            /* Load v stencil */
            __m256d v_c  = _mm256_loadu_pd(&v[idx]);
            __m256d v_il = _mm256_loadu_pd(&v[idx - 1]);
            __m256d v_ir = _mm256_loadu_pd(&v[idx + 1]);
            __m256d v_jd = _mm256_loadu_pd(&v[jd_row + (size_t)i]);
            __m256d v_ju = _mm256_loadu_pd(&v[ju_row + (size_t)i]);
            __m256d v_kd = _mm256_loadu_pd(&v[kd_off + j * nx + (size_t)i]);
            __m256d v_ku = _mm256_loadu_pd(&v[ku_off + j * nx + (size_t)i]);

            /* Load w stencil */
            __m256d w_c  = _mm256_loadu_pd(&w[idx]);
            __m256d w_il = _mm256_loadu_pd(&w[idx - 1]);
            __m256d w_ir = _mm256_loadu_pd(&w[idx + 1]);
            __m256d w_jd = _mm256_loadu_pd(&w[jd_row + (size_t)i]);
            __m256d w_ju = _mm256_loadu_pd(&w[ju_row + (size_t)i]);
            __m256d w_kd = _mm256_loadu_pd(&w[kd_off + j * nx + (size_t)i]);
            __m256d w_ku = _mm256_loadu_pd(&w[ku_off + j * nx + (size_t)i]);

            /* Load p stencil (center value not needed for gradients) */
            __m256d p_il = _mm256_loadu_pd(&p[idx - 1]);
            __m256d p_ir = _mm256_loadu_pd(&p[idx + 1]);
            __m256d p_jd = _mm256_loadu_pd(&p[jd_row + (size_t)i]);
            __m256d p_ju = _mm256_loadu_pd(&p[ju_row + (size_t)i]);
            __m256d p_kd = _mm256_loadu_pd(&p[kd_off + j * nx + (size_t)i]);
            __m256d p_ku = _mm256_loadu_pd(&p[ku_off + j * nx + (size_t)i]);

            /* Density: clamp for safe division, but track validity */
            __m256d rho_raw   = _mm256_loadu_pd(&rho[idx]);
            __m256d rho_valid = _mm256_cmp_pd(rho_raw, eps, _CMP_GT_OQ);
            __m256d rho_c     = _mm256_max_pd(rho_raw, eps);
            __m256d rho_inv   = _mm256_div_pd(one, rho_c);

            /* dx stencil: dx_inv[i] = 1/(2*dx[i]), 0 when dx < 1e-10 */
            __m256d dx_inv_v = _mm256_loadu_pd(&ctx->dx_inv[(size_t)i]);
            __m256d dx_valid = _mm256_cmp_pd(dx_inv_v, _mm256_setzero_pd(), _CMP_GT_OQ);

            /* Combined validity: rho > eps AND dx > 0 (dy already checked at row level) */
            __m256d valid = _mm256_and_pd(rho_valid, dx_valid);
            /* 1/dx^2 = 4 * (1/(2*dx))^2 */
            __m256d dx2_inv_v = _mm256_mul_pd(four, _mm256_mul_pd(dx_inv_v, dx_inv_v));

            /* First derivatives — u (clamped central differences) */
            __m256d du_dx = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(u_ir, u_il), dx_inv_v), min_d1, max_d1);
            __m256d du_dy = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(u_ju, u_jd), dy_inv_v), min_d1, max_d1);
            __m256d du_dz = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(u_ku, u_kd), dz_inv_v), min_d1, max_d1);

            /* First derivatives — v */
            __m256d dv_dx = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(v_ir, v_il), dx_inv_v), min_d1, max_d1);
            __m256d dv_dy = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(v_ju, v_jd), dy_inv_v), min_d1, max_d1);
            __m256d dv_dz = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(v_ku, v_kd), dz_inv_v), min_d1, max_d1);

            /* First derivatives — w */
            __m256d dw_dx = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(w_ir, w_il), dx_inv_v), min_d1, max_d1);
            __m256d dw_dy = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(w_ju, w_jd), dy_inv_v), min_d1, max_d1);
            __m256d dw_dz = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(w_ku, w_kd), dz_inv_v), min_d1, max_d1);

            /* Pressure gradients */
            __m256d dp_dx = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(p_ir, p_il), dx_inv_v), min_d1, max_d1);
            __m256d dp_dy = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(p_ju, p_jd), dy_inv_v), min_d1, max_d1);
            __m256d dp_dz = avx2_clamp(
                _mm256_mul_pd(_mm256_sub_pd(p_ku, p_kd), dz_inv_v), min_d1, max_d1);

            /* Second derivatives — u */
            __m256d d2u_dx2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(u_ir, u_il), _mm256_mul_pd(two, u_c)),
                    dx2_inv_v),
                min_d2, max_d2);
            __m256d d2u_dy2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(u_ju, u_jd), _mm256_mul_pd(two, u_c)),
                    dy2_inv_v),
                min_d2, max_d2);
            __m256d d2u_dz2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(u_ku, u_kd), _mm256_mul_pd(two, u_c)),
                    dz2_inv_v),
                min_d2, max_d2);

            /* Second derivatives — v */
            __m256d d2v_dx2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(v_ir, v_il), _mm256_mul_pd(two, v_c)),
                    dx2_inv_v),
                min_d2, max_d2);
            __m256d d2v_dy2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(v_ju, v_jd), _mm256_mul_pd(two, v_c)),
                    dy2_inv_v),
                min_d2, max_d2);
            __m256d d2v_dz2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(v_ku, v_kd), _mm256_mul_pd(two, v_c)),
                    dz2_inv_v),
                min_d2, max_d2);

            /* Second derivatives — w */
            __m256d d2w_dx2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(w_ir, w_il), _mm256_mul_pd(two, w_c)),
                    dx2_inv_v),
                min_d2, max_d2);
            __m256d d2w_dy2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(w_ju, w_jd), _mm256_mul_pd(two, w_c)),
                    dy2_inv_v),
                min_d2, max_d2);
            __m256d d2w_dz2 = avx2_clamp(
                _mm256_mul_pd(
                    _mm256_sub_pd(_mm256_add_pd(w_ku, w_kd), _mm256_mul_pd(two, w_c)),
                    dz2_inv_v),
                min_d2, max_d2);

            /* Kinematic viscosity: nu = min(mu/rho, 1.0) */
            __m256d nu = _mm256_min_pd(one, _mm256_mul_pd(mu_v, rho_inv));

            /* Source terms: computed scalar per lane and packed.
             * NOTE(review): z_coord is loop-invariant across lanes and could be
             * hoisted; left in place to keep this kernel line-for-line auditable
             * against the scalar path. */
            double src_u_arr[4] = {0.0, 0.0, 0.0, 0.0};
            double src_v_arr[4] = {0.0, 0.0, 0.0, 0.0};
            double src_w_arr[4] = {0.0, 0.0, 0.0, 0.0};
            for (int lane = 0; lane < 4; lane++) {
                double z_coord = (nz > 1 && g->z) ? g->z[k] : 0.0;
                compute_source_terms(g->x[(size_t)i + (size_t)lane], g->y[j],
                                     z_coord, iter, dt, params,
                                     &src_u_arr[lane], &src_v_arr[lane],
                                     &src_w_arr[lane]);
            }
            __m256d src_u_v = _mm256_loadu_pd(src_u_arr);
            __m256d src_v_v = _mm256_loadu_pd(src_v_arr);
            __m256d src_w_v = _mm256_loadu_pd(src_w_arr);

            /* RHS u: nu*(d2u_dx2+d2u_dy2+d2u_dz2) - dp_dx/rho - conv_u + src_u */
            __m256d conv_u = _mm256_add_pd(
                _mm256_add_pd(
                    _mm256_mul_pd(u_c, du_dx),
                    _mm256_mul_pd(v_c, du_dy)),
                _mm256_mul_pd(w_c, du_dz));
            __m256d rhs_u_v = _mm256_add_pd(
                _mm256_sub_pd(
                    _mm256_sub_pd(
                        _mm256_mul_pd(nu, _mm256_add_pd(_mm256_add_pd(d2u_dx2, d2u_dy2), d2u_dz2)),
                        _mm256_mul_pd(dp_dx, rho_inv)),
                    conv_u),
                src_u_v);

            /* RHS v: nu*(d2v_dx2+d2v_dy2+d2v_dz2) - dp_dy/rho - conv_v + src_v */
            __m256d conv_v = _mm256_add_pd(
                _mm256_add_pd(
                    _mm256_mul_pd(u_c, dv_dx),
                    _mm256_mul_pd(v_c, dv_dy)),
                _mm256_mul_pd(w_c, dv_dz));
            __m256d rhs_v_v = _mm256_add_pd(
                _mm256_sub_pd(
                    _mm256_sub_pd(
                        _mm256_mul_pd(nu, _mm256_add_pd(_mm256_add_pd(d2v_dx2, d2v_dy2), d2v_dz2)),
                        _mm256_mul_pd(dp_dy, rho_inv)),
                    conv_v),
                src_v_v);

            /* RHS w: nu*(d2w_dx2+d2w_dy2+d2w_dz2) - dp_dz/rho - conv_w + src_w */
            __m256d conv_w = _mm256_add_pd(
                _mm256_add_pd(
                    _mm256_mul_pd(u_c, dw_dx),
                    _mm256_mul_pd(v_c, dw_dy)),
                _mm256_mul_pd(w_c, dw_dz));
            __m256d rhs_w_v = _mm256_add_pd(
                _mm256_sub_pd(
                    _mm256_sub_pd(
                        _mm256_mul_pd(nu, _mm256_add_pd(_mm256_add_pd(d2w_dx2, d2w_dy2), d2w_dz2)),
                        _mm256_mul_pd(dp_dz, rho_inv)),
                    conv_w),
                src_w_v);

            /* Pressure RHS: -0.1 * rho * divergence */
            __m256d divergence = avx2_clamp(
                _mm256_add_pd(_mm256_add_pd(du_dx, dv_dy), dw_dz), min_div, max_div);
            __m256d rhs_p_v = _mm256_mul_pd(
                neg_puf, _mm256_mul_pd(rho_c, divergence));

            /* Zero RHS for invalid lanes (rho <= eps or dx degenerate):
             * ANDing with the all-zeros/all-ones compare mask zeroes bad lanes. */
            rhs_u_v = _mm256_and_pd(rhs_u_v, valid);
            rhs_v_v = _mm256_and_pd(rhs_v_v, valid);
            rhs_w_v = _mm256_and_pd(rhs_w_v, valid);
            rhs_p_v = _mm256_and_pd(rhs_p_v, valid);

            _mm256_storeu_pd(&rhs_u[idx], rhs_u_v);
            _mm256_storeu_pd(&rhs_v[idx], rhs_v_v);
            _mm256_storeu_pd(&rhs_w[idx], rhs_w_v);
            _mm256_storeu_pd(&rhs_p[idx], rhs_p_v);
        }

        /* --- Scalar: remainder from end of AVX2 to nx-2 (inclusive) ---
         * Handles 0-3 leftover interior points plus the periodic ir-wrap edge. */
        for (; i <= (ptrdiff_t)(nx - 2); i++) {
            size_t si = (size_t)i;
            size_t idx = j_off + si;
            size_t il = idx - 1;
            size_t ir = (si == nx - 2) ? j_off + 1 : idx + 1; /* periodic at last point */
            size_t kd = kd_off + j * nx + si;
            size_t ku = ku_off + j * nx + si;
            rk2_rhs_point(u, v, w, p, rho, rhs_u, rhs_v, rhs_w, rhs_p, g, params,
                          idx, il, ir, jd_row + si, ju_row + si, kd, ku,
                          inv_2dz, inv_dz2, si, j, k, nz, iter, dt);
        }
    }
}
501 :
502 : /*
503 : * Compute RHS for all interior rows and z-planes.
504 : * k-loop is outside the OMP parallel for j-loop (MSVC OMP 2.0 compatibility).
505 : */
506 : static void compute_rhs_avx2(
507 : const double* u, const double* v, const double* w,
508 : const double* p, const double* rho,
509 : double* rhs_u, double* rhs_v, double* rhs_w, double* rhs_p,
510 : const rk2_avx2_context_t* ctx, const grid* g,
511 : const ns_solver_params_t* params, int iter, double dt)
512 : {
513 : ptrdiff_t ny_int = (ptrdiff_t)ctx->ny;
514 : for (size_t k = ctx->k_start; k < ctx->k_end; k++) {
515 : ptrdiff_t j;
516 : #ifdef _OPENMP
517 : #pragma omp parallel for schedule(static)
518 : #endif
519 : for (j = 1; j < ny_int - 1; j++) {
520 : compute_rhs_row(u, v, w, p, rho, rhs_u, rhs_v, rhs_w, rhs_p,
521 : ctx, g, params, (size_t)j, k, iter, dt);
522 : }
523 : }
524 : }
525 :
526 : /* ============================================================================
527 : * RK2 AVX2 MAIN LOOP
528 : * ============================================================================ */
529 :
530 : static cfd_status_t rk2_avx2_impl(flow_field* field, rk2_avx2_context_t* ctx,
531 : const grid* g, const ns_solver_params_t* params)
532 : {
533 : size_t n = ctx->nx * ctx->ny * ctx->nz;
534 : size_t bytes = n * sizeof(double);
535 : double dt = params->dt;
536 : ptrdiff_t n_int = (ptrdiff_t)n;
537 : cfd_status_t status = CFD_SUCCESS;
538 :
539 : for (int iter = 0; iter < params->max_iter; iter++) {
540 : /* Save Q^n */
541 : memcpy(ctx->u0, field->u, bytes);
542 : memcpy(ctx->v0, field->v, bytes);
543 : memcpy(ctx->w0, field->w, bytes);
544 : memcpy(ctx->p0, field->p, bytes);
545 :
546 : /* ---- Stage 1: k1 = RHS(Q^n) ---- */
547 : memset(ctx->k1_u, 0, bytes);
548 : memset(ctx->k1_v, 0, bytes);
549 : memset(ctx->k1_w, 0, bytes);
550 : memset(ctx->k1_p, 0, bytes);
551 : compute_rhs_avx2(field->u, field->v, field->w, field->p, field->rho,
552 : ctx->k1_u, ctx->k1_v, ctx->k1_w, ctx->k1_p,
553 : ctx, g, params, iter, dt);
554 :
555 : /* ---- Intermediate: field = Q^n + dt*k1 ---- */
556 : {
557 : ptrdiff_t k;
558 : #ifdef _OPENMP
559 : #pragma omp parallel for schedule(static)
560 : #endif
561 : for (k = 0; k < n_int; k++) {
562 : field->u[k] = fmax(-MAX_VELOCITY_LIMIT,
563 : fmin( MAX_VELOCITY_LIMIT,
564 : ctx->u0[k] + dt * ctx->k1_u[k]));
565 : field->v[k] = fmax(-MAX_VELOCITY_LIMIT,
566 : fmin( MAX_VELOCITY_LIMIT,
567 : ctx->v0[k] + dt * ctx->k1_v[k]));
568 : field->w[k] = fmax(-MAX_VELOCITY_LIMIT,
569 : fmin( MAX_VELOCITY_LIMIT,
570 : ctx->w0[k] + dt * ctx->k1_w[k]));
571 : field->p[k] = ctx->p0[k] + dt * ctx->k1_p[k];
572 : }
573 : }
574 :
575 : /* NOTE: Do NOT apply BCs between RK stages.
576 : * The ghost cells carry zero-derivative evolution (k1[ghost]=0),
577 : * consistent with the semi-discrete ODE. Applying BCs here would
578 : * reduce RK2 to first-order temporal accuracy. */
579 :
580 : /* ---- Stage 2: k2 = RHS(Q_pred) ---- */
581 : memset(ctx->k2_u, 0, bytes);
582 : memset(ctx->k2_v, 0, bytes);
583 : memset(ctx->k2_w, 0, bytes);
584 : memset(ctx->k2_p, 0, bytes);
585 : compute_rhs_avx2(field->u, field->v, field->w, field->p, field->rho,
586 : ctx->k2_u, ctx->k2_v, ctx->k2_w, ctx->k2_p,
587 : ctx, g, params, iter, dt);
588 :
589 : /* ---- Final update: Q^{n+1} = Q^n + (dt/2)*(k1 + k2) ---- */
590 : {
591 : double half_dt = 0.5 * dt;
592 : ptrdiff_t k;
593 : #ifdef _OPENMP
594 : #pragma omp parallel for schedule(static)
595 : #endif
596 : for (k = 0; k < n_int; k++) {
597 : field->u[k] = fmax(-MAX_VELOCITY_LIMIT,
598 : fmin( MAX_VELOCITY_LIMIT,
599 : ctx->u0[k] + half_dt * (ctx->k1_u[k] + ctx->k2_u[k])));
600 : field->v[k] = fmax(-MAX_VELOCITY_LIMIT,
601 : fmin( MAX_VELOCITY_LIMIT,
602 : ctx->v0[k] + half_dt * (ctx->k1_v[k] + ctx->k2_v[k])));
603 : field->w[k] = fmax(-MAX_VELOCITY_LIMIT,
604 : fmin( MAX_VELOCITY_LIMIT,
605 : ctx->w0[k] + half_dt * (ctx->k1_w[k] + ctx->k2_w[k])));
606 : field->p[k] = ctx->p0[k] + half_dt * (ctx->k1_p[k] + ctx->k2_p[k]);
607 : }
608 : }
609 :
610 : /* Apply BCs to final state only */
611 : apply_boundary_conditions(field, g);
612 :
613 : /* NaN / Inf check (parallelized) */
614 : {
615 : int has_nan = 0;
616 : ptrdiff_t k;
617 : #ifdef _OPENMP
618 : #pragma omp parallel for reduction(|:has_nan) schedule(static)
619 : #endif
620 : for (k = 0; k < n_int; k++) {
621 : if (!isfinite(field->u[k]) || !isfinite(field->v[k]) ||
622 : !isfinite(field->w[k]) || !isfinite(field->p[k])) {
623 : has_nan = 1;
624 : }
625 : }
626 : if (has_nan) {
627 : status = CFD_ERROR_DIVERGED;
628 : goto cleanup;
629 : }
630 : }
631 : }
632 :
633 : cleanup:
634 : return status;
635 : }
636 :
637 : #endif /* USE_AVX2 */
638 :
639 : /* ============================================================================
640 : * PUBLIC API IMPLEMENTATIONS
641 : * ============================================================================ */
642 :
643 6 : cfd_status_t rk2_avx2_init(ns_solver_t* solver, const grid* g,
644 : const ns_solver_params_t* params)
645 : {
646 6 : (void)params;
647 :
648 : #if !USE_AVX2
649 6 : (void)solver;
650 6 : (void)g;
651 6 : cfd_set_error(CFD_ERROR_UNSUPPORTED, "AVX2 not available in this build");
652 6 : return CFD_ERROR_UNSUPPORTED;
653 : #else
654 : if (!solver || !g) {
655 : return CFD_ERROR_INVALID;
656 : }
657 : if (g->nx < 3 || g->ny < 3 || (g->nz > 1 && g->nz < 3)) {
658 : return CFD_ERROR_INVALID;
659 : }
660 :
661 : rk2_avx2_context_t* ctx =
662 : (rk2_avx2_context_t*)cfd_calloc(1, sizeof(rk2_avx2_context_t));
663 : if (!ctx) {
664 : return CFD_ERROR_NOMEM;
665 : }
666 :
667 : ctx->nx = g->nx;
668 : ctx->ny = g->ny;
669 : ctx->nz = g->nz;
670 : size_t bytes = ctx->nx * ctx->ny * g->nz * sizeof(double);
671 :
672 : /* Reject non-uniform z-spacing (solver uses constant inv_2dz/inv_dz2) */
673 : if (g->nz > 1 && g->dz) {
674 : for (size_t kk = 1; kk < g->nz - 1; kk++) {
675 : if (fabs(g->dz[kk] - g->dz[0]) > 1e-14) {
676 : cfd_free(ctx);
677 : return CFD_ERROR_INVALID;
678 : }
679 : }
680 : }
681 :
682 : /* Branch-free 3D constants */
683 : size_t plane = ctx->nx * ctx->ny;
684 : ctx->stride_z = (g->nz > 1) ? plane : 0;
685 : ctx->k_start = (g->nz > 1) ? 1 : 0;
686 : ctx->k_end = (g->nz > 1) ? (g->nz - 1) : 1;
687 : ctx->inv_2dz = (g->nz > 1 && g->dz) ? 1.0 / (2.0 * g->dz[0]) : 0.0;
688 : ctx->inv_dz2 = (g->nz > 1 && g->dz) ? 1.0 / (g->dz[0] * g->dz[0]) : 0.0;
689 :
690 : ctx->k1_u = (double*)cfd_aligned_malloc(bytes);
691 : ctx->k1_v = (double*)cfd_aligned_malloc(bytes);
692 : ctx->k1_w = (double*)cfd_aligned_malloc(bytes);
693 : ctx->k1_p = (double*)cfd_aligned_malloc(bytes);
694 : ctx->k2_u = (double*)cfd_aligned_malloc(bytes);
695 : ctx->k2_v = (double*)cfd_aligned_malloc(bytes);
696 : ctx->k2_w = (double*)cfd_aligned_malloc(bytes);
697 : ctx->k2_p = (double*)cfd_aligned_malloc(bytes);
698 : ctx->u0 = (double*)cfd_aligned_malloc(bytes);
699 : ctx->v0 = (double*)cfd_aligned_malloc(bytes);
700 : ctx->w0 = (double*)cfd_aligned_malloc(bytes);
701 : ctx->p0 = (double*)cfd_aligned_malloc(bytes);
702 : ctx->dx_inv = (double*)cfd_aligned_malloc(ctx->nx * sizeof(double));
703 : ctx->dy_inv = (double*)cfd_aligned_malloc(ctx->ny * sizeof(double));
704 :
705 : if (!ctx->k1_u || !ctx->k1_v || !ctx->k1_w || !ctx->k1_p ||
706 : !ctx->k2_u || !ctx->k2_v || !ctx->k2_w || !ctx->k2_p ||
707 : !ctx->u0 || !ctx->v0 || !ctx->w0 || !ctx->p0 ||
708 : !ctx->dx_inv || !ctx->dy_inv) {
709 : cfd_aligned_free(ctx->k1_u); cfd_aligned_free(ctx->k1_v);
710 : cfd_aligned_free(ctx->k1_w); cfd_aligned_free(ctx->k1_p);
711 : cfd_aligned_free(ctx->k2_u); cfd_aligned_free(ctx->k2_v);
712 : cfd_aligned_free(ctx->k2_w); cfd_aligned_free(ctx->k2_p);
713 : cfd_aligned_free(ctx->u0); cfd_aligned_free(ctx->v0);
714 : cfd_aligned_free(ctx->w0); cfd_aligned_free(ctx->p0);
715 : cfd_aligned_free(ctx->dx_inv);
716 : cfd_aligned_free(ctx->dy_inv);
717 : cfd_free(ctx);
718 : return CFD_ERROR_NOMEM;
719 : }
720 :
721 : for (size_t i = 0; i < ctx->nx; i++) {
722 : ctx->dx_inv[i] = (g->dx[i] > 1e-10) ? 1.0 / (2.0 * g->dx[i]) : 0.0;
723 : }
724 : for (size_t j = 0; j < ctx->ny; j++) {
725 : ctx->dy_inv[j] = (g->dy[j] > 1e-10) ? 1.0 / (2.0 * g->dy[j]) : 0.0;
726 : }
727 :
728 : ctx->initialized = 1;
729 : solver->context = ctx;
730 :
731 : #ifdef _OPENMP
732 : printf("RK2 SIMD: AVX2 + OpenMP enabled (%d threads)\n", omp_get_max_threads());
733 : #else
734 : printf("RK2 SIMD: AVX2 enabled (OpenMP disabled)\n");
735 : #endif
736 :
737 : return CFD_SUCCESS;
738 : #endif
739 : }
740 :
741 6 : void rk2_avx2_destroy(ns_solver_t* solver)
742 : {
743 6 : if (!solver || !solver->context) {
744 : return;
745 : }
746 0 : rk2_avx2_context_t* ctx = (rk2_avx2_context_t*)solver->context;
747 0 : if (ctx->initialized) {
748 0 : cfd_aligned_free(ctx->k1_u); cfd_aligned_free(ctx->k1_v);
749 0 : cfd_aligned_free(ctx->k1_w); cfd_aligned_free(ctx->k1_p);
750 0 : cfd_aligned_free(ctx->k2_u); cfd_aligned_free(ctx->k2_v);
751 0 : cfd_aligned_free(ctx->k2_w); cfd_aligned_free(ctx->k2_p);
752 0 : cfd_aligned_free(ctx->u0); cfd_aligned_free(ctx->v0);
753 0 : cfd_aligned_free(ctx->w0); cfd_aligned_free(ctx->p0);
754 0 : cfd_aligned_free(ctx->dx_inv);
755 0 : cfd_aligned_free(ctx->dy_inv);
756 : }
757 0 : cfd_free(ctx);
758 0 : solver->context = NULL;
759 : }
760 :
761 0 : cfd_status_t rk2_avx2_step(ns_solver_t* solver, flow_field* field, const grid* g,
762 : const ns_solver_params_t* params, ns_solver_stats_t* stats)
763 : {
764 : #if !USE_AVX2
765 0 : (void)solver; (void)field; (void)g; (void)params; (void)stats;
766 0 : return CFD_ERROR_UNSUPPORTED;
767 : #else
768 : if (!solver || !solver->context || !field || !g || !params) {
769 : return CFD_ERROR_INVALID;
770 : }
771 : if (field->nx < 3 || field->ny < 3 || (field->nz > 1 && field->nz < 3)) {
772 : return CFD_ERROR_INVALID;
773 : }
774 :
775 : rk2_avx2_context_t* ctx = (rk2_avx2_context_t*)solver->context;
776 : if (field->nx != ctx->nx || field->ny != ctx->ny || field->nz != ctx->nz) {
777 : return CFD_ERROR_INVALID;
778 : }
779 :
780 : ns_solver_params_t step_params = *params;
781 : step_params.max_iter = 1;
782 :
783 : cfd_status_t status = rk2_avx2_impl(field, ctx, g, &step_params);
784 :
785 : if (stats) {
786 : stats->iterations = 1;
787 : double max_vel = 0.0, max_p = 0.0;
788 : ptrdiff_t n_s = (ptrdiff_t)(field->nx * field->ny * field->nz);
789 : ptrdiff_t ks;
790 : #if defined(_OPENMP) && (_OPENMP >= 201107)
791 : #pragma omp parallel for reduction(max: max_vel, max_p) schedule(static)
792 : #endif
793 : for (ks = 0; ks < n_s; ks++) {
794 : double vel = sqrt(field->u[ks] * field->u[ks] +
795 : field->v[ks] * field->v[ks] +
796 : field->w[ks] * field->w[ks]);
797 : if (vel > max_vel) max_vel = vel;
798 : double ap = fabs(field->p[ks]);
799 : if (ap > max_p) max_p = ap;
800 : }
801 : stats->max_velocity = max_vel;
802 : stats->max_pressure = max_p;
803 : }
804 :
805 : return status;
806 : #endif
807 : }
808 :
809 0 : cfd_status_t rk2_avx2_solve(ns_solver_t* solver, flow_field* field, const grid* g,
810 : const ns_solver_params_t* params, ns_solver_stats_t* stats)
811 : {
812 : #if !USE_AVX2
813 0 : (void)solver; (void)field; (void)g; (void)params; (void)stats;
814 0 : return CFD_ERROR_UNSUPPORTED;
815 : #else
816 : if (!solver || !solver->context || !field || !g || !params) {
817 : return CFD_ERROR_INVALID;
818 : }
819 : if (field->nx < 3 || field->ny < 3 || (field->nz > 1 && field->nz < 3)) {
820 : return CFD_ERROR_INVALID;
821 : }
822 :
823 : rk2_avx2_context_t* ctx = (rk2_avx2_context_t*)solver->context;
824 : if (field->nx != ctx->nx || field->ny != ctx->ny || field->nz != ctx->nz) {
825 : return CFD_ERROR_INVALID;
826 : }
827 :
828 : /* Call impl directly so its internal loop runs iter=0..max_iter-1,
829 : * giving compute_source_terms the correct iteration index. */
830 : cfd_status_t status = rk2_avx2_impl(field, ctx, g, params);
831 :
832 : if (stats) {
833 : stats->iterations = params->max_iter;
834 : double max_vel = 0.0, max_p = 0.0;
835 : ptrdiff_t n_s = (ptrdiff_t)(field->nx * field->ny * field->nz);
836 : ptrdiff_t ks;
837 : #if defined(_OPENMP) && (_OPENMP >= 201107)
838 : #pragma omp parallel for reduction(max: max_vel, max_p) schedule(static)
839 : #endif
840 : for (ks = 0; ks < n_s; ks++) {
841 : double vel = sqrt(field->u[ks] * field->u[ks] +
842 : field->v[ks] * field->v[ks] +
843 : field->w[ks] * field->w[ks]);
844 : if (vel > max_vel) max_vel = vel;
845 : double ap = fabs(field->p[ks]);
846 : if (ap > max_p) max_p = ap;
847 : }
848 : stats->max_velocity = max_vel;
849 : stats->max_pressure = max_p;
850 : }
851 : return status;
852 : #endif
853 : }
|