Line data Source code
1 : /**
2 : * Optimized Explicit Euler NSSolver with SIMD + OpenMP
3 : *
4 : * This implementation combines SIMD vectorization (AVX2) with OpenMP
5 : * parallelization for maximum performance on multi-core CPUs.
6 : *
7 : * - Outer loops are parallelized with OpenMP
8 : * - Inner loops use AVX2 SIMD intrinsics for vectorization
9 : *
10 : * Note: When AVX2 is not enabled at compile time (CFD_ENABLE_AVX2=OFF),
11 : * this solver uses scalar code paths. Use the base explicit_euler solver
12 : * for guaranteed scalar-only execution.
13 : */
14 :
15 : // Enable C11 features for aligned_alloc
16 : #define _POSIX_C_SOURCE 200809L
17 : #define _ISOC11_SOURCE
18 : #define _USE_MATH_DEFINES
19 :
20 : #include "cfd/core/cfd_status.h"
21 : #include "cfd/core/grid.h"
22 : #include "cfd/core/indexing.h"
23 : #include "cfd/core/memory.h"
24 : #include "cfd/solvers/navier_stokes_solver.h"
25 :
26 : #include "../boundary_copy_utils.h"
27 :
#include <math.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
36 :
37 : #ifdef _OPENMP
38 : #include <omp.h>
39 : #endif
40 :
41 : /* AVX2 detection
42 : * CFD_HAS_AVX2 is set by CMake when -DCFD_ENABLE_AVX2=ON.
43 : * This works consistently across all compilers (GCC, Clang, MSVC).
44 : */
45 : #if defined(CFD_HAS_AVX2)
46 : #include <immintrin.h>
47 : #define USE_AVX 1
48 : #else
49 : #define USE_AVX 0
50 : #endif
51 :
52 : // Physical stability limits
53 : #define MAX_DERIVATIVE_LIMIT 100.0
54 : #define MAX_SECOND_DERIVATIVE_LIMIT 1000.0
55 : #define MAX_VELOCITY_LIMIT 100.0
56 : #define MAX_DIVERGENCE_LIMIT 10.0
57 : #define DT_CONSERVATIVE_LIMIT 0.0001
58 : #define UPDATE_LIMIT 1.0
59 : #define PRESSURE_UPDATE_FACTOR 0.1
60 :
61 : typedef struct {
62 : double* u_new;
63 : double* v_new;
64 : double* w_new;
65 : double* p_new;
66 : double* dx_inv;
67 : double* dy_inv;
68 : size_t nx;
69 : size_t ny;
70 : size_t nz;
71 : size_t stride_z;
72 : size_t k_start;
73 : size_t k_end;
74 : double inv_2dz;
75 : double inv_dz2;
76 : int initialized;
77 : } explicit_euler_simd_context;
78 :
79 : // Public API functions
80 : cfd_status_t explicit_euler_simd_init(struct NSSolver* solver, const grid* grid,
81 : const ns_solver_params_t* params);
82 : void explicit_euler_simd_destroy(struct NSSolver* solver);
83 : cfd_status_t explicit_euler_simd_step(struct NSSolver* solver, flow_field* field, const grid* grid,
84 : const ns_solver_params_t* params, ns_solver_stats_t* stats);
85 :
86 10 : cfd_status_t explicit_euler_simd_init(struct NSSolver* solver, const grid* grid,
87 : const ns_solver_params_t* params) {
88 10 : (void)params; // Unused
89 10 : if (!solver || !grid) {
90 : return CFD_ERROR_INVALID;
91 : }
92 10 : if (grid->nx < 3 || grid->ny < 3 || (grid->nz > 1 && grid->nz < 3)) {
93 : return CFD_ERROR_INVALID;
94 : }
95 :
96 10 : explicit_euler_simd_context* ctx =
97 10 : (explicit_euler_simd_context*)cfd_calloc(1, sizeof(explicit_euler_simd_context));
98 10 : if (!ctx) {
99 : return CFD_ERROR_NOMEM;
100 : }
101 :
102 10 : ctx->nx = grid->nx;
103 10 : ctx->ny = grid->ny;
104 10 : ctx->nz = grid->nz;
105 10 : size_t field_size = ctx->nx * ctx->ny * ctx->nz * sizeof(double);
106 :
107 : /* Reject non-uniform z-spacing (solver uses constant inv_2dz/inv_dz2) */
108 10 : if (grid->nz > 1 && grid->dz) {
109 14 : for (size_t kk = 1; kk < grid->nz - 1; kk++) {
110 12 : if (fabs(grid->dz[kk] - grid->dz[0]) > 1e-14) {
111 0 : cfd_free(ctx);
112 0 : return CFD_ERROR_INVALID;
113 : }
114 : }
115 : }
116 :
117 10 : size_t plane = ctx->nx * ctx->ny;
118 10 : ctx->stride_z = (grid->nz > 1) ? plane : 0;
119 10 : ctx->k_start = (grid->nz > 1) ? 1 : 0;
120 10 : ctx->k_end = (grid->nz > 1) ? (grid->nz - 1) : 1;
121 10 : ctx->inv_2dz = (grid->nz > 1 && grid->dz) ? 1.0 / (2.0 * grid->dz[0]) : 0.0;
122 10 : ctx->inv_dz2 = (grid->nz > 1 && grid->dz) ? 1.0 / (grid->dz[0] * grid->dz[0]) : 0.0;
123 :
124 10 : ctx->u_new = (double*)cfd_aligned_malloc(field_size);
125 10 : ctx->v_new = (double*)cfd_aligned_malloc(field_size);
126 10 : ctx->w_new = (double*)cfd_aligned_malloc(field_size);
127 10 : ctx->p_new = (double*)cfd_aligned_malloc(field_size);
128 10 : ctx->dx_inv = (double*)cfd_aligned_malloc(ctx->nx * sizeof(double));
129 10 : ctx->dy_inv = (double*)cfd_aligned_malloc(ctx->ny * sizeof(double));
130 :
131 10 : if (!ctx->u_new || !ctx->v_new || !ctx->w_new || !ctx->p_new || !ctx->dx_inv || !ctx->dy_inv) {
132 0 : if (ctx->u_new) {
133 0 : cfd_aligned_free(ctx->u_new);
134 : }
135 0 : if (ctx->v_new) {
136 0 : cfd_aligned_free(ctx->v_new);
137 : }
138 0 : if (ctx->w_new) {
139 0 : cfd_aligned_free(ctx->w_new);
140 : }
141 0 : if (ctx->p_new) {
142 0 : cfd_aligned_free(ctx->p_new);
143 : }
144 0 : if (ctx->dx_inv) {
145 0 : cfd_aligned_free(ctx->dx_inv);
146 : }
147 0 : if (ctx->dy_inv) {
148 0 : cfd_aligned_free(ctx->dy_inv);
149 : }
150 0 : cfd_free(ctx);
151 0 : return CFD_ERROR_NOMEM;
152 : }
153 :
154 : // Pre-compute inverses
155 263 : for (size_t i = 0; i < ctx->nx; i++) {
156 253 : ctx->dx_inv[i] = (i < ctx->nx - 1) ? 1.0 / (2.0 * grid->dx[i]) : 0.0;
157 : }
158 261 : for (size_t j = 0; j < ctx->ny; j++) {
159 251 : ctx->dy_inv[j] = (j < ctx->ny - 1) ? 1.0 / (2.0 * grid->dy[j]) : 0.0;
160 : }
161 :
162 10 : ctx->initialized = 1;
163 10 : solver->context = ctx;
164 :
165 : #if USE_AVX
166 : #ifdef _OPENMP
167 : printf("Explicit Euler SIMD: AVX2 + OpenMP enabled (%d threads)\n", omp_get_max_threads());
168 : #else
169 : printf("Explicit Euler SIMD: AVX2 enabled (OpenMP disabled)\n");
170 : #endif
171 : #else
172 : #ifdef _OPENMP
173 10 : printf("Explicit Euler OMP: Scalar + OpenMP enabled (%d threads)\n", omp_get_max_threads());
174 : #else
175 : printf("Explicit Euler: Scalar fallback (no SIMD or OpenMP)\n");
176 : #endif
177 : #endif
178 :
179 10 : return CFD_SUCCESS;
180 : }
181 :
182 14 : void explicit_euler_simd_destroy(struct NSSolver* solver) {
183 14 : if (solver && solver->context) {
184 10 : explicit_euler_simd_context* ctx = (explicit_euler_simd_context*)solver->context;
185 10 : if (ctx->initialized) {
186 10 : cfd_aligned_free(ctx->u_new);
187 10 : cfd_aligned_free(ctx->v_new);
188 10 : cfd_aligned_free(ctx->w_new);
189 10 : cfd_aligned_free(ctx->p_new);
190 10 : cfd_aligned_free(ctx->dx_inv);
191 10 : cfd_aligned_free(ctx->dy_inv);
192 : }
193 10 : cfd_free(ctx);
194 10 : solver->context = NULL;
195 : }
196 14 : }
197 :
198 : #if USE_AVX
/* Lane-wise max/min wrappers over the AVX intrinsics, named to mirror the
 * scalar fmax/fmin clamps used by the fallback path. */
static inline __m256d vector_fmax(__m256d a, __m256d b) {
    return _mm256_max_pd(a, b);
}
static inline __m256d vector_fmin(__m256d a, __m256d b) {
    return _mm256_min_pd(a, b);
}
205 :
/* Per-step scalar constants broadcast into AVX registers once, so the inner
 * loops avoid repeated _mm256_set1_pd calls. */
typedef struct {
    __m256d dt_vec;          /* conservative time step, all lanes */
    __m256d max_deriv;       /* +MAX_DERIVATIVE_LIMIT */
    __m256d min_deriv;       /* -MAX_DERIVATIVE_LIMIT */
    __m256d max_diverg;      /* +MAX_DIVERGENCE_LIMIT */
    __m256d min_diverg;      /* -MAX_DIVERGENCE_LIMIT */
    __m256d max_vel_limit;   /* +MAX_VELOCITY_LIMIT */
    __m256d min_vel_limit;   /* -MAX_VELOCITY_LIMIT */
    __m256d one_vec;         /* +UPDATE_LIMIT (1.0); also reused as literal 1.0 */
    __m256d neg_one_vec;     /* -UPDATE_LIMIT */
    __m256d pressure_factor; /* -PRESSURE_UPDATE_FACTOR */
    __m256d two;             /* 2.0, for central second differences */
    __m256d four;            /* 4.0, converts (1/(2dx))^2 into 1/dx^2 */
    __m256d epsilon;         /* density floor (1e-10) before 1/rho */
    __m256d mu_vec;          /* dynamic viscosity from params */
    __m256d zero;            /* 0.0, used to negate via subtraction */
    __m256d inv_2dz_vec;     /* 1/(2*dz), 0 in 2D */
    __m256d inv_dz2_vec;     /* 1/dz^2, 0 in 2D */
} simd_constants;
225 :
226 : static void init_simd_constants(simd_constants* c, const ns_solver_params_t* params,
227 : double conservative_dt, double inv_2dz, double inv_dz2) {
228 : c->dt_vec = _mm256_set1_pd(conservative_dt);
229 : c->max_deriv = _mm256_set1_pd(MAX_DERIVATIVE_LIMIT);
230 : c->min_deriv = _mm256_set1_pd(-MAX_DERIVATIVE_LIMIT);
231 : c->max_diverg = _mm256_set1_pd(MAX_DIVERGENCE_LIMIT);
232 : c->min_diverg = _mm256_set1_pd(-MAX_DIVERGENCE_LIMIT);
233 : c->max_vel_limit = _mm256_set1_pd(MAX_VELOCITY_LIMIT);
234 : c->min_vel_limit = _mm256_set1_pd(-MAX_VELOCITY_LIMIT);
235 : c->one_vec = _mm256_set1_pd(UPDATE_LIMIT);
236 : c->neg_one_vec = _mm256_set1_pd(-UPDATE_LIMIT);
237 : c->pressure_factor = _mm256_set1_pd(-PRESSURE_UPDATE_FACTOR);
238 : c->two = _mm256_set1_pd(2.0);
239 : c->four = _mm256_set1_pd(4.0);
240 : c->epsilon = _mm256_set1_pd(1e-10);
241 : c->mu_vec = _mm256_set1_pd(params->mu);
242 : c->zero = _mm256_setzero_pd();
243 : c->inv_2dz_vec = _mm256_set1_pd(inv_2dz);
244 : c->inv_dz2_vec = _mm256_set1_pd(inv_dz2);
245 : }
246 :
247 : static void process_simd_row(explicit_euler_simd_context* ctx, flow_field* field, const grid* grid,
248 : size_t j, const simd_constants* sc,
249 : size_t stride_z, size_t k_offset) {
250 : double dy2 = grid->dy[j] * grid->dy[j];
251 : __m256d dy_inv_val = _mm256_set1_pd(ctx->dy_inv[j]);
252 : __m256d dy2_val = _mm256_set1_pd(dy2);
253 : __m256d dy2_recip = _mm256_div_pd(sc->one_vec, dy2_val);
254 :
255 : for (size_t i = 1; i + 3 < ctx->nx - 1; i += 4) {
256 : size_t idx = k_offset + IDX_2D(i, j, ctx->nx);
257 :
258 : __m256d u = _mm256_loadu_pd(&field->u[idx]);
259 : __m256d v = _mm256_loadu_pd(&field->v[idx]);
260 : __m256d rho = _mm256_loadu_pd(&field->rho[idx]);
261 : __m256d rho_inv = _mm256_div_pd(sc->one_vec, _mm256_max_pd(rho, sc->epsilon));
262 : __m256d dx_inv_val = _mm256_loadu_pd(&ctx->dx_inv[i]);
263 :
264 : __m256d u_xp = _mm256_loadu_pd(&field->u[idx + 1]);
265 : __m256d u_xm = _mm256_loadu_pd(&field->u[idx - 1]);
266 : __m256d u_yp = _mm256_loadu_pd(&field->u[idx + ctx->nx]);
267 : __m256d u_ym = _mm256_loadu_pd(&field->u[idx - ctx->nx]);
268 : __m256d u_zp = _mm256_loadu_pd(&field->u[idx + stride_z]);
269 : __m256d u_zm = _mm256_loadu_pd(&field->u[idx - stride_z]);
270 :
271 : __m256d du_dx = _mm256_mul_pd(_mm256_sub_pd(u_xp, u_xm), dx_inv_val);
272 : __m256d du_dy = _mm256_mul_pd(_mm256_sub_pd(u_yp, u_ym), dy_inv_val);
273 : __m256d du_dz = _mm256_mul_pd(_mm256_sub_pd(u_zp, u_zm), sc->inv_2dz_vec);
274 :
275 : du_dx = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, du_dx));
276 : du_dy = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, du_dy));
277 : du_dz = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, du_dz));
278 :
279 : __m256d v_xp = _mm256_loadu_pd(&field->v[idx + 1]);
280 : __m256d v_xm = _mm256_loadu_pd(&field->v[idx - 1]);
281 : __m256d v_yp = _mm256_loadu_pd(&field->v[idx + ctx->nx]);
282 : __m256d v_ym = _mm256_loadu_pd(&field->v[idx - ctx->nx]);
283 : __m256d v_zp = _mm256_loadu_pd(&field->v[idx + stride_z]);
284 : __m256d v_zm = _mm256_loadu_pd(&field->v[idx - stride_z]);
285 :
286 : __m256d dv_dx = _mm256_mul_pd(_mm256_sub_pd(v_xp, v_xm), dx_inv_val);
287 : __m256d dv_dy = _mm256_mul_pd(_mm256_sub_pd(v_yp, v_ym), dy_inv_val);
288 : __m256d dv_dz = _mm256_mul_pd(_mm256_sub_pd(v_zp, v_zm), sc->inv_2dz_vec);
289 :
290 : dv_dx = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dv_dx));
291 : dv_dy = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dv_dy));
292 : dv_dz = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dv_dz));
293 :
294 : __m256d p_xp = _mm256_loadu_pd(&field->p[idx + 1]);
295 : __m256d p_xm = _mm256_loadu_pd(&field->p[idx - 1]);
296 : __m256d p_yp = _mm256_loadu_pd(&field->p[idx + ctx->nx]);
297 : __m256d p_ym = _mm256_loadu_pd(&field->p[idx - ctx->nx]);
298 : __m256d p_zp = _mm256_loadu_pd(&field->p[idx + stride_z]);
299 : __m256d p_zm = _mm256_loadu_pd(&field->p[idx - stride_z]);
300 :
301 : __m256d dp_dx = _mm256_mul_pd(_mm256_sub_pd(p_xp, p_xm), dx_inv_val);
302 : __m256d dp_dy = _mm256_mul_pd(_mm256_sub_pd(p_yp, p_ym), dy_inv_val);
303 : __m256d dp_dz = _mm256_mul_pd(_mm256_sub_pd(p_zp, p_zm), sc->inv_2dz_vec);
304 :
305 : dp_dx = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dp_dx));
306 : dp_dy = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dp_dy));
307 : dp_dz = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dp_dz));
308 :
309 : __m256d w = _mm256_loadu_pd(&field->w[idx]);
310 : __m256d w_xp = _mm256_loadu_pd(&field->w[idx + 1]);
311 : __m256d w_xm = _mm256_loadu_pd(&field->w[idx - 1]);
312 : __m256d w_yp = _mm256_loadu_pd(&field->w[idx + ctx->nx]);
313 : __m256d w_ym = _mm256_loadu_pd(&field->w[idx - ctx->nx]);
314 : __m256d w_zp = _mm256_loadu_pd(&field->w[idx + stride_z]);
315 : __m256d w_zm = _mm256_loadu_pd(&field->w[idx - stride_z]);
316 :
317 : __m256d dw_dx = _mm256_mul_pd(_mm256_sub_pd(w_xp, w_xm), dx_inv_val);
318 : __m256d dw_dy = _mm256_mul_pd(_mm256_sub_pd(w_yp, w_ym), dy_inv_val);
319 : __m256d dw_dz = _mm256_mul_pd(_mm256_sub_pd(w_zp, w_zm), sc->inv_2dz_vec);
320 :
321 : dw_dx = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dw_dx));
322 : dw_dy = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dw_dy));
323 : dw_dz = vector_fmax(sc->min_deriv, vector_fmin(sc->max_deriv, dw_dz));
324 :
325 : __m256d inv_dx_sq = _mm256_mul_pd(sc->four, _mm256_mul_pd(dx_inv_val, dx_inv_val));
326 :
327 : __m256d d2u_dx2 = _mm256_mul_pd(
328 : _mm256_sub_pd(_mm256_add_pd(u_xp, u_xm), _mm256_mul_pd(sc->two, u)), inv_dx_sq);
329 : __m256d d2u_dy2 = _mm256_mul_pd(
330 : _mm256_sub_pd(_mm256_add_pd(u_yp, u_ym), _mm256_mul_pd(sc->two, u)), dy2_recip);
331 : __m256d d2u_dz2 = _mm256_mul_pd(
332 : _mm256_sub_pd(_mm256_add_pd(u_zp, u_zm), _mm256_mul_pd(sc->two, u)), sc->inv_dz2_vec);
333 :
334 : __m256d d2v_dx2 = _mm256_mul_pd(
335 : _mm256_sub_pd(_mm256_add_pd(v_xp, v_xm), _mm256_mul_pd(sc->two, v)), inv_dx_sq);
336 : __m256d d2v_dy2 = _mm256_mul_pd(
337 : _mm256_sub_pd(_mm256_add_pd(v_yp, v_ym), _mm256_mul_pd(sc->two, v)), dy2_recip);
338 : __m256d d2v_dz2 = _mm256_mul_pd(
339 : _mm256_sub_pd(_mm256_add_pd(v_zp, v_zm), _mm256_mul_pd(sc->two, v)), sc->inv_dz2_vec);
340 :
341 : __m256d d2w_dx2 = _mm256_mul_pd(
342 : _mm256_sub_pd(_mm256_add_pd(w_xp, w_xm), _mm256_mul_pd(sc->two, w)), inv_dx_sq);
343 : __m256d d2w_dy2 = _mm256_mul_pd(
344 : _mm256_sub_pd(_mm256_add_pd(w_yp, w_ym), _mm256_mul_pd(sc->two, w)), dy2_recip);
345 : __m256d d2w_dz2 = _mm256_mul_pd(
346 : _mm256_sub_pd(_mm256_add_pd(w_zp, w_zm), _mm256_mul_pd(sc->two, w)), sc->inv_dz2_vec);
347 :
348 : __m256d nu = _mm256_min_pd(sc->one_vec, _mm256_mul_pd(sc->mu_vec, rho_inv));
349 :
350 : __m256d term_pres_x = _mm256_mul_pd(dp_dx, rho_inv);
351 : __m256d term_visc_u = _mm256_mul_pd(nu, _mm256_add_pd(_mm256_add_pd(d2u_dx2, d2u_dy2), d2u_dz2));
352 : __m256d conv_u = _mm256_add_pd(
353 : _mm256_add_pd(_mm256_mul_pd(u, du_dx), _mm256_mul_pd(v, du_dy)),
354 : _mm256_mul_pd(w, du_dz));
355 : __m256d du =
356 : _mm256_mul_pd(sc->dt_vec, _mm256_add_pd(_mm256_sub_pd(term_visc_u, term_pres_x),
357 : _mm256_sub_pd(sc->zero, conv_u)));
358 :
359 : __m256d term_pres_y = _mm256_mul_pd(dp_dy, rho_inv);
360 : __m256d term_visc_v = _mm256_mul_pd(nu, _mm256_add_pd(_mm256_add_pd(d2v_dx2, d2v_dy2), d2v_dz2));
361 : __m256d conv_v = _mm256_add_pd(
362 : _mm256_add_pd(_mm256_mul_pd(u, dv_dx), _mm256_mul_pd(v, dv_dy)),
363 : _mm256_mul_pd(w, dv_dz));
364 : __m256d dv =
365 : _mm256_mul_pd(sc->dt_vec, _mm256_add_pd(_mm256_sub_pd(term_visc_v, term_pres_y),
366 : _mm256_sub_pd(sc->zero, conv_v)));
367 :
368 : __m256d term_pres_z = _mm256_mul_pd(dp_dz, rho_inv);
369 : __m256d term_visc_w = _mm256_mul_pd(nu, _mm256_add_pd(_mm256_add_pd(d2w_dx2, d2w_dy2), d2w_dz2));
370 : __m256d conv_w = _mm256_add_pd(
371 : _mm256_add_pd(_mm256_mul_pd(u, dw_dx), _mm256_mul_pd(v, dw_dy)),
372 : _mm256_mul_pd(w, dw_dz));
373 : __m256d dw =
374 : _mm256_mul_pd(sc->dt_vec, _mm256_add_pd(_mm256_sub_pd(term_visc_w, term_pres_z),
375 : _mm256_sub_pd(sc->zero, conv_w)));
376 :
377 : du = vector_fmin(sc->one_vec, vector_fmax(sc->neg_one_vec, du));
378 : dv = vector_fmin(sc->one_vec, vector_fmax(sc->neg_one_vec, dv));
379 : dw = vector_fmin(sc->one_vec, vector_fmax(sc->neg_one_vec, dw));
380 :
381 : __m256d u_next = _mm256_add_pd(u, du);
382 : __m256d v_next = _mm256_add_pd(v, dv);
383 : __m256d w_next = _mm256_add_pd(w, dw);
384 :
385 : u_next = vector_fmin(sc->max_vel_limit, vector_fmax(sc->min_vel_limit, u_next));
386 : v_next = vector_fmin(sc->max_vel_limit, vector_fmax(sc->min_vel_limit, v_next));
387 : w_next = vector_fmin(sc->max_vel_limit, vector_fmax(sc->min_vel_limit, w_next));
388 :
389 : __m256d divergence = _mm256_add_pd(_mm256_add_pd(du_dx, dv_dy), dw_dz);
390 : divergence = vector_fmin(sc->max_diverg, vector_fmax(sc->min_diverg, divergence));
391 : __m256d p = _mm256_loadu_pd(&field->p[idx]);
392 : __m256d dp = _mm256_mul_pd(
393 : sc->dt_vec, _mm256_mul_pd(sc->pressure_factor, _mm256_mul_pd(rho, divergence)));
394 : dp = vector_fmin(sc->one_vec, vector_fmax(sc->neg_one_vec, dp));
395 : __m256d p_next = _mm256_add_pd(p, dp);
396 :
397 : _mm256_storeu_pd(&ctx->u_new[idx], u_next);
398 : _mm256_storeu_pd(&ctx->v_new[idx], v_next);
399 : _mm256_storeu_pd(&ctx->w_new[idx], w_next);
400 : _mm256_storeu_pd(&ctx->p_new[idx], p_next);
401 : }
402 : }
403 : #endif
404 :
405 : #if !USE_AVX
406 15536 : static void process_scalar_row(explicit_euler_simd_context* ctx, flow_field* field,
407 : const grid* grid, const ns_solver_params_t* params, size_t j,
408 : double conservative_dt, double t, size_t stride_z, size_t k_offset) {
409 876450 : for (size_t i = 1; i < ctx->nx - 1; i++) {
410 860914 : size_t idx = k_offset + IDX_2D(i, j, ctx->nx);
411 :
412 860914 : double du_dx = (field->u[idx + 1] - field->u[idx - 1]) / (2.0 * grid->dx[i]);
413 860914 : double du_dy = (field->u[idx + ctx->nx] - field->u[idx - ctx->nx]) / (2.0 * grid->dy[j]);
414 860914 : double du_dz = (field->u[idx + stride_z] - field->u[idx - stride_z]) * ctx->inv_2dz;
415 860914 : double dv_dx = (field->v[idx + 1] - field->v[idx - 1]) / (2.0 * grid->dx[i]);
416 860914 : double dv_dy = (field->v[idx + ctx->nx] - field->v[idx - ctx->nx]) / (2.0 * grid->dy[j]);
417 860914 : double dv_dz = (field->v[idx + stride_z] - field->v[idx - stride_z]) * ctx->inv_2dz;
418 860914 : double dw_dx = (field->w[idx + 1] - field->w[idx - 1]) / (2.0 * grid->dx[i]);
419 860914 : double dw_dy = (field->w[idx + ctx->nx] - field->w[idx - ctx->nx]) / (2.0 * grid->dy[j]);
420 860914 : double dw_dz = (field->w[idx + stride_z] - field->w[idx - stride_z]) * ctx->inv_2dz;
421 :
422 860914 : double dp_dx = (field->p[idx + 1] - field->p[idx - 1]) / (2.0 * grid->dx[i]);
423 860914 : double dp_dy = (field->p[idx + ctx->nx] - field->p[idx - ctx->nx]) / (2.0 * grid->dy[j]);
424 860914 : double dp_dz = (field->p[idx + stride_z] - field->p[idx - stride_z]) * ctx->inv_2dz;
425 :
426 860914 : double d2u_dx2 = (field->u[idx + 1] - 2.0 * field->u[idx] + field->u[idx - 1]) /
427 860914 : (grid->dx[i] * grid->dx[i]);
428 860914 : double d2u_dy2 = (field->u[idx + ctx->nx] - 2.0 * field->u[idx] + field->u[idx - ctx->nx]) /
429 860914 : (grid->dy[j] * grid->dy[j]);
430 860914 : double d2u_dz2 = (field->u[idx + stride_z] - 2.0 * field->u[idx] + field->u[idx - stride_z]) *
431 860914 : ctx->inv_dz2;
432 860914 : double d2v_dx2 = (field->v[idx + 1] - 2.0 * field->v[idx] + field->v[idx - 1]) /
433 : (grid->dx[i] * grid->dx[i]);
434 860914 : double d2v_dy2 = (field->v[idx + ctx->nx] - 2.0 * field->v[idx] + field->v[idx - ctx->nx]) /
435 : (grid->dy[j] * grid->dy[j]);
436 860914 : double d2v_dz2 = (field->v[idx + stride_z] - 2.0 * field->v[idx] + field->v[idx - stride_z]) *
437 : ctx->inv_dz2;
438 860914 : double d2w_dx2 = (field->w[idx + 1] - 2.0 * field->w[idx] + field->w[idx - 1]) /
439 : (grid->dx[i] * grid->dx[i]);
440 860914 : double d2w_dy2 = (field->w[idx + ctx->nx] - 2.0 * field->w[idx] + field->w[idx - ctx->nx]) /
441 : (grid->dy[j] * grid->dy[j]);
442 860914 : double d2w_dz2 = (field->w[idx + stride_z] - 2.0 * field->w[idx] + field->w[idx - stride_z]) *
443 : ctx->inv_dz2;
444 :
445 860914 : double rho = fmax(field->rho[idx], 1e-10);
446 : // Using manual define for M_PI just in case it is missed in fallback
447 : #ifndef M_PI
448 : #define M_PI 3.14159265358979323846
449 : #endif
450 860914 : double nu = fmin(params->mu / rho, 1.0);
451 :
452 860914 : du_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dx));
453 860914 : du_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dy));
454 860914 : du_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, du_dz));
455 860914 : dv_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dx));
456 860914 : dv_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dy));
457 860914 : dv_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dv_dz));
458 860914 : dw_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dx));
459 860914 : dw_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dy));
460 860914 : dw_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dw_dz));
461 860914 : dp_dx = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dx));
462 860914 : dp_dy = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dy));
463 860914 : dp_dz = fmax(-MAX_DERIVATIVE_LIMIT, fmin(MAX_DERIVATIVE_LIMIT, dp_dz));
464 :
465 860914 : double source_u = 0.0;
466 860914 : double source_v = 0.0;
467 860914 : double source_w = 0.0;
468 860914 : if (params->source_func) {
469 0 : params->source_func(grid->x[i], grid->y[j], 0.0, t,
470 0 : params->source_context,
471 : &source_u, &source_v, &source_w);
472 860914 : } else if (params->source_amplitude_u > 0) {
473 860482 : source_u = params->source_amplitude_u * sin(M_PI * grid->y[j]);
474 860482 : source_v = params->source_amplitude_v * sin(2.0 * M_PI * grid->x[i]);
475 : }
476 :
477 860914 : double u_c = field->u[idx];
478 860914 : double v_c = field->v[idx];
479 860914 : double w_c = field->w[idx];
480 :
481 860914 : double du = conservative_dt * (-u_c * du_dx - v_c * du_dy - w_c * du_dz -
482 860914 : dp_dx / rho + nu * (d2u_dx2 + d2u_dy2 + d2u_dz2) + source_u);
483 860914 : double dv = conservative_dt * (-u_c * dv_dx - v_c * dv_dy - w_c * dv_dz -
484 860914 : dp_dy / rho + nu * (d2v_dx2 + d2v_dy2 + d2v_dz2) + source_v);
485 860914 : double dw = conservative_dt * (-u_c * dw_dx - v_c * dw_dy - w_c * dw_dz -
486 860914 : dp_dz / rho + nu * (d2w_dx2 + d2w_dy2 + d2w_dz2) + source_w);
487 :
488 860914 : du = fmax(-UPDATE_LIMIT, fmin(UPDATE_LIMIT, du));
489 860914 : dv = fmax(-UPDATE_LIMIT, fmin(UPDATE_LIMIT, dv));
490 860914 : dw = fmax(-UPDATE_LIMIT, fmin(UPDATE_LIMIT, dw));
491 :
492 860914 : ctx->u_new[idx] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, u_c + du));
493 860914 : ctx->v_new[idx] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, v_c + dv));
494 860914 : ctx->w_new[idx] = fmax(-MAX_VELOCITY_LIMIT, fmin(MAX_VELOCITY_LIMIT, w_c + dw));
495 :
496 860914 : double divergence = du_dx + dv_dy + dw_dz;
497 860914 : divergence = fmax(-MAX_DIVERGENCE_LIMIT, fmin(MAX_DIVERGENCE_LIMIT, divergence));
498 860914 : double dp = -PRESSURE_UPDATE_FACTOR * conservative_dt * rho * divergence;
499 860914 : dp = fmax(-UPDATE_LIMIT, fmin(UPDATE_LIMIT, dp));
500 860914 : ctx->p_new[idx] = field->p[idx] + dp;
501 : }
502 15536 : }
503 : #endif
504 :
505 305 : cfd_status_t explicit_euler_simd_step(struct NSSolver* solver, flow_field* field, const grid* grid,
506 : const ns_solver_params_t* params, ns_solver_stats_t* stats) {
507 305 : if (!solver || !solver->context || !field || !grid || !params) {
508 : return CFD_ERROR_INVALID;
509 : }
510 :
511 305 : explicit_euler_simd_context* ctx = (explicit_euler_simd_context*)solver->context;
512 :
513 305 : if (field->nx < 3 || field->ny < 3 || (field->nz > 1 && field->nz < 3)) {
514 : return CFD_ERROR_INVALID;
515 : }
516 :
517 305 : if (field->nx != ctx->nx || field->ny != ctx->ny || field->nz != ctx->nz) {
518 : return CFD_ERROR_INVALID;
519 : }
520 :
521 : // Use conservative time step to match basic solver stability
522 305 : double conservative_dt = fmin(params->dt, DT_CONSERVATIVE_LIMIT);
523 :
524 : // Copy current state to temp buffers
525 305 : size_t size = ctx->nx * ctx->ny * ctx->nz;
526 305 : memcpy(ctx->u_new, field->u, size * sizeof(double));
527 305 : memcpy(ctx->v_new, field->v, size * sizeof(double));
528 305 : memcpy(ctx->w_new, field->w, size * sizeof(double));
529 305 : memcpy(ctx->p_new, field->p, size * sizeof(double));
530 :
531 305 : size_t nx = ctx->nx;
532 305 : size_t ny = ctx->ny;
533 :
534 : #if USE_AVX
535 : simd_constants sc;
536 : init_simd_constants(&sc, params, conservative_dt, ctx->inv_2dz, ctx->inv_dz2);
537 :
538 : int ny_int = (int)(ctx->ny);
539 : int j;
540 : for (size_t k = ctx->k_start; k < ctx->k_end; k++) {
541 : size_t k_offset = k * ctx->stride_z;
542 : #ifdef _OPENMP
543 : #pragma omp parallel for schedule(static)
544 : #endif
545 : for (j = 1; j < ny_int - 1; j++) {
546 : process_simd_row(ctx, field, grid, (size_t)j, &sc, ctx->stride_z, k_offset);
547 : }
548 : }
549 : #else
550 305 : int ny_int = (int)(ctx->ny);
551 305 : int j;
552 620 : for (size_t k = ctx->k_start; k < ctx->k_end; k++) {
553 315 : size_t k_offset = k * ctx->stride_z;
554 : #ifdef _OPENMP
555 315 : #pragma omp parallel for schedule(static)
556 : #endif
557 : for (j = 1; j < ny_int - 1; j++) {
558 : process_scalar_row(ctx, field, grid, params, (size_t)j, conservative_dt, 0.0,
559 : ctx->stride_z, k_offset);
560 : }
561 : }
562 : #endif
563 :
564 : // Apply boundary, check NaNs, etc.
565 305 : memcpy(field->u, ctx->u_new, size * sizeof(double));
566 305 : memcpy(field->v, ctx->v_new, size * sizeof(double));
567 305 : memcpy(field->w, ctx->w_new, size * sizeof(double));
568 305 : memcpy(field->p, ctx->p_new, size * sizeof(double));
569 :
570 : // Store caller-set boundary values before apply_boundary_conditions overwrites them
571 305 : copy_boundary_velocities_3d(ctx->u_new, ctx->v_new, ctx->w_new,
572 305 : field->u, field->v, field->w, nx, ny, ctx->nz);
573 305 : apply_boundary_conditions(field, grid);
574 305 : copy_boundary_velocities_3d(field->u, field->v, field->w,
575 305 : ctx->u_new, ctx->v_new, ctx->w_new, nx, ny, ctx->nz);
576 :
577 305 : if (stats) {
578 305 : stats->iterations = 1;
579 : }
580 :
581 : // NaN/Inf check
582 305 : int has_nan = 0;
583 924847 : for (size_t n = 0; n < size; n++) {
584 924542 : if (!isfinite(field->u[n]) || !isfinite(field->v[n]) ||
585 924542 : !isfinite(field->w[n]) || !isfinite(field->p[n])) {
586 : has_nan = 1;
587 : break;
588 : }
589 : }
590 305 : if (has_nan) {
591 0 : cfd_set_error(CFD_ERROR_DIVERGED,
592 : "NaN/Inf detected in explicit_euler_simd step");
593 0 : return CFD_ERROR_DIVERGED;
594 : }
595 :
596 : return CFD_SUCCESS;
597 : }
|