Line data Source code
1 : /**
2 : * Optimized Projection Method NSSolver (Chorin's Method) with SIMD + OpenMP
3 : *
4 : * This implementation combines SIMD vectorization (AVX2) with OpenMP
5 : * parallelization for maximum performance on multi-core CPUs.
6 : *
7 : * - Predictor step: OpenMP parallelized (scalar inner loops)
8 : * - Corrector step: OpenMP parallelized with AVX2 SIMD inner loops
9 : * - Poisson solver: Uses SIMD Poisson solver for pressure computation
10 : */
11 :
12 : #include "cfd/boundary/boundary_conditions.h"
13 : #include "cfd/core/cfd_status.h"
14 : #include "cfd/core/grid.h"
15 : #include "cfd/core/indexing.h"
16 : #include "cfd/core/memory.h"
17 : #include "cfd/solvers/navier_stokes_solver.h"
18 : #include "cfd/solvers/poisson_solver.h"
19 :
20 : #include "../boundary_copy_utils.h"
21 :
22 : #include <math.h>
23 : #include <stdio.h>
24 : #include <string.h>
25 :
26 : #ifndef M_PI
27 : #define M_PI 3.14159265358979323846
28 : #endif
29 :
30 : #ifdef _OPENMP
31 : #include <omp.h>
32 : #endif
33 :
34 : /* AVX2 detection
35 : * CFD_HAS_AVX2 is set by CMake when -DCFD_ENABLE_AVX2=ON.
36 : * This works consistently across all compilers (GCC, Clang, MSVC).
37 : */
38 : #if defined(CFD_HAS_AVX2)
39 : #include <immintrin.h>
40 : #define USE_AVX 1
41 : #else
42 : #define USE_AVX 0
43 : #endif
44 :
45 : // Physical limits
46 : #define MAX_VELOCITY 100.0
47 :
48 : typedef struct {
49 : double* u_star;
50 : double* v_star;
51 : double* w_star;
52 : double* p_new;
53 : double* rhs;
54 : double* u_new; /* used as p_temp for Poisson solver */
55 : size_t nx;
56 : size_t ny;
57 : size_t nz;
58 : size_t stride_z;
59 : size_t k_start;
60 : size_t k_end;
61 : double inv_2dz;
62 : double inv_dz2;
63 : int initialized;
64 : int iter_count;
65 : } projection_simd_context;
66 :
67 : // Public API
68 : cfd_status_t projection_simd_init(struct NSSolver* solver, const grid* grid,
69 : const ns_solver_params_t* params);
70 : void projection_simd_destroy(struct NSSolver* solver);
71 : cfd_status_t projection_simd_step(struct NSSolver* solver, flow_field* field, const grid* grid,
72 : const ns_solver_params_t* params, ns_solver_stats_t* stats);
73 :
74 14 : cfd_status_t projection_simd_init(struct NSSolver* solver, const grid* grid,
75 : const ns_solver_params_t* params) {
76 14 : (void)params;
77 14 : if (!solver || !grid) {
78 : return CFD_ERROR_INVALID;
79 : }
80 14 : if (grid->nx < 3 || grid->ny < 3 || (grid->nz > 1 && grid->nz < 3)) {
81 : return CFD_ERROR_INVALID;
82 : }
83 :
84 : /* Verify SIMD CG Poisson solver is available before allocating resources */
85 14 : poisson_solver_t* test_solver = poisson_solver_create(
86 : POISSON_METHOD_CG, POISSON_BACKEND_SIMD);
87 14 : if (!test_solver) {
88 14 : fprintf(stderr, "projection_simd_init: SIMD CG Poisson solver not available\n");
89 14 : return CFD_ERROR_UNSUPPORTED;
90 : }
91 0 : poisson_solver_destroy(test_solver);
92 :
93 0 : projection_simd_context* ctx =
94 0 : (projection_simd_context*)cfd_calloc(1, sizeof(projection_simd_context));
95 0 : if (!ctx) {
96 : return CFD_ERROR_NOMEM;
97 : }
98 :
99 0 : ctx->nx = grid->nx;
100 0 : ctx->ny = grid->ny;
101 0 : ctx->nz = grid->nz;
102 0 : size_t size = ctx->nx * ctx->ny * grid->nz * sizeof(double);
103 :
104 : /* Reject non-uniform z-spacing (solver uses constant dz) */
105 0 : if (grid->nz > 1 && grid->dz) {
106 0 : for (size_t kk = 1; kk < grid->nz - 1; kk++) {
107 0 : if (fabs(grid->dz[kk] - grid->dz[0]) > 1e-14) {
108 0 : cfd_free(ctx);
109 0 : return CFD_ERROR_INVALID;
110 : }
111 : }
112 : }
113 :
114 0 : size_t plane = ctx->nx * ctx->ny;
115 0 : ctx->stride_z = (grid->nz > 1) ? plane : 0;
116 0 : ctx->k_start = (grid->nz > 1) ? 1 : 0;
117 0 : ctx->k_end = (grid->nz > 1) ? (grid->nz - 1) : 1;
118 0 : double dz = (grid->nz > 1 && grid->dz) ? grid->dz[0] : 0.0;
119 0 : ctx->inv_2dz = (grid->nz > 1 && grid->dz) ? 1.0 / (2.0 * dz) : 0.0;
120 0 : ctx->inv_dz2 = (grid->nz > 1 && grid->dz) ? 1.0 / (dz * dz) : 0.0;
121 :
122 0 : ctx->u_star = (double*)cfd_aligned_malloc(size);
123 0 : ctx->v_star = (double*)cfd_aligned_malloc(size);
124 0 : ctx->w_star = (double*)cfd_aligned_malloc(size);
125 0 : ctx->p_new = (double*)cfd_aligned_malloc(size);
126 0 : ctx->rhs = (double*)cfd_aligned_malloc(size);
127 0 : ctx->u_new = (double*)cfd_aligned_malloc(size);
128 :
129 0 : if (!ctx->u_star || !ctx->v_star || !ctx->w_star || !ctx->p_new ||
130 0 : !ctx->rhs || !ctx->u_new) {
131 0 : if (ctx->u_star) {
132 0 : cfd_aligned_free(ctx->u_star);
133 : }
134 0 : if (ctx->v_star) {
135 0 : cfd_aligned_free(ctx->v_star);
136 : }
137 0 : if (ctx->w_star) {
138 0 : cfd_aligned_free(ctx->w_star);
139 : }
140 0 : if (ctx->p_new) {
141 0 : cfd_aligned_free(ctx->p_new);
142 : }
143 0 : if (ctx->rhs) {
144 0 : cfd_aligned_free(ctx->rhs);
145 : }
146 0 : if (ctx->u_new) {
147 0 : cfd_aligned_free(ctx->u_new);
148 : }
149 0 : cfd_free(ctx);
150 0 : return CFD_ERROR_NOMEM;
151 : }
152 :
153 0 : ctx->initialized = 1;
154 0 : solver->context = ctx;
155 0 : return CFD_SUCCESS;
156 : }
157 :
158 18 : void projection_simd_destroy(struct NSSolver* solver) {
159 18 : if (solver && solver->context) {
160 0 : projection_simd_context* ctx = (projection_simd_context*)solver->context;
161 0 : if (ctx->initialized) {
162 0 : cfd_aligned_free(ctx->u_star);
163 0 : cfd_aligned_free(ctx->v_star);
164 0 : cfd_aligned_free(ctx->w_star);
165 0 : cfd_aligned_free(ctx->p_new);
166 0 : cfd_aligned_free(ctx->rhs);
167 0 : cfd_aligned_free(ctx->u_new);
168 : }
169 0 : cfd_free(ctx);
170 0 : solver->context = NULL;
171 : }
172 18 : }
173 :
174 1 : cfd_status_t projection_simd_step(struct NSSolver* solver, flow_field* field, const grid* grid,
175 : const ns_solver_params_t* params, ns_solver_stats_t* stats) {
176 1 : if (!solver || !solver->context || !field || !grid || !params) {
177 : return CFD_ERROR_INVALID;
178 : }
179 0 : if (field->nx < 3 || field->ny < 3 || (field->nz > 1 && field->nz < 3)) {
180 : return CFD_ERROR_INVALID;
181 : }
182 :
183 0 : projection_simd_context* ctx = (projection_simd_context*)solver->context;
184 :
185 : // Verify context matches current grid
186 0 : if (ctx->nx != field->nx || ctx->ny != field->ny || ctx->nz != field->nz) {
187 : return CFD_ERROR_INVALID;
188 : }
189 :
190 0 : size_t nx = field->nx;
191 0 : size_t ny = field->ny;
192 0 : size_t size = nx * ny * ctx->nz;
193 :
194 0 : double dx = grid->dx[0];
195 0 : double dy = grid->dy[0];
196 0 : double dz = (ctx->nz > 1 && grid->dz) ? grid->dz[0] : 0.0;
197 0 : double dt = params->dt;
198 0 : double nu = params->mu; // Viscosity (treated as kinematic for ρ=1)
199 :
200 0 : double* u_star = ctx->u_star;
201 0 : double* v_star = ctx->v_star;
202 0 : double* w_star = ctx->w_star;
203 0 : double* p_new = ctx->p_new;
204 0 : double* rhs = ctx->rhs;
205 :
206 : // Copy current field values to work buffers (includes boundaries)
207 0 : memcpy(u_star, field->u, size * sizeof(double));
208 0 : memcpy(v_star, field->v, size * sizeof(double));
209 0 : memcpy(w_star, field->w, size * sizeof(double));
210 0 : memcpy(p_new, field->p, size * sizeof(double));
211 0 : memset(rhs, 0, size * sizeof(double));
212 :
213 : // ============================================================
214 : // STEP 1: Predictor - Compute intermediate velocity u*
215 : // (OpenMP parallelized outer loop, scalar inner loop)
216 : // ============================================================
217 0 : int ny_int = (int)ny;
218 0 : int nx_int = (int)nx;
219 0 : int jj;
220 0 : (void)nx_int; /* suppress unused variable warning */
221 :
222 0 : for (size_t k = ctx->k_start; k < ctx->k_end; k++) {
223 0 : size_t k_off = k * ctx->stride_z;
224 : #ifdef _OPENMP
225 0 : #pragma omp parallel for schedule(static)
226 : #endif
227 : for (jj = 1; jj < ny_int - 1; jj++) {
228 : size_t j = (size_t)jj;
229 : for (size_t i = 1; i < nx - 1; i++) {
230 : size_t idx = k_off + IDX_2D(i, j, nx);
231 :
232 : double u = field->u[idx];
233 : double v = field->v[idx];
234 : double w = field->w[idx];
235 :
236 : // Convective terms: -u·∇u (central differences)
237 : double du_dx = (field->u[idx + 1] - field->u[idx - 1]) / (2.0 * dx);
238 : double du_dy = (field->u[idx + nx] - field->u[idx - nx]) / (2.0 * dy);
239 : double du_dz = (field->u[idx + ctx->stride_z] - field->u[idx - ctx->stride_z]) *
240 : ctx->inv_2dz;
241 :
242 : double dv_dx = (field->v[idx + 1] - field->v[idx - 1]) / (2.0 * dx);
243 : double dv_dy = (field->v[idx + nx] - field->v[idx - nx]) / (2.0 * dy);
244 : double dv_dz = (field->v[idx + ctx->stride_z] - field->v[idx - ctx->stride_z]) *
245 : ctx->inv_2dz;
246 :
247 : double dw_dx = (field->w[idx + 1] - field->w[idx - 1]) / (2.0 * dx);
248 : double dw_dy = (field->w[idx + nx] - field->w[idx - nx]) / (2.0 * dy);
249 : double dw_dz = (field->w[idx + ctx->stride_z] - field->w[idx - ctx->stride_z]) *
250 : ctx->inv_2dz;
251 :
252 : double conv_u = (u * du_dx) + (v * du_dy) + (w * du_dz);
253 : double conv_v = (u * dv_dx) + (v * dv_dy) + (w * dv_dz);
254 : double conv_w = (u * dw_dx) + (v * dw_dy) + (w * dw_dz);
255 :
256 : // Viscous terms: ν∇²u
257 : double d2u_dx2 = (field->u[idx + 1] - 2.0 * u + field->u[idx - 1]) / (dx * dx);
258 : double d2u_dy2 = (field->u[idx + nx] - 2.0 * u + field->u[idx - nx]) / (dy * dy);
259 : double d2u_dz2 = (field->u[idx + ctx->stride_z] - 2.0 * u +
260 : field->u[idx - ctx->stride_z]) * ctx->inv_dz2;
261 :
262 : double d2v_dx2 = (field->v[idx + 1] - 2.0 * v + field->v[idx - 1]) / (dx * dx);
263 : double d2v_dy2 = (field->v[idx + nx] - 2.0 * v + field->v[idx - nx]) / (dy * dy);
264 : double d2v_dz2 = (field->v[idx + ctx->stride_z] - 2.0 * v +
265 : field->v[idx - ctx->stride_z]) * ctx->inv_dz2;
266 :
267 : double d2w_dx2 = (field->w[idx + 1] - 2.0 * w + field->w[idx - 1]) / (dx * dx);
268 : double d2w_dy2 = (field->w[idx + nx] - 2.0 * w + field->w[idx - nx]) / (dy * dy);
269 : double d2w_dz2 = (field->w[idx + ctx->stride_z] - 2.0 * w +
270 : field->w[idx - ctx->stride_z]) * ctx->inv_dz2;
271 :
272 : double visc_u = nu * (d2u_dx2 + d2u_dy2 + d2u_dz2);
273 : double visc_v = nu * (d2v_dx2 + d2v_dy2 + d2v_dz2);
274 : double visc_w = nu * (d2w_dx2 + d2w_dy2 + d2w_dz2);
275 :
276 : // Source terms
277 : double source_u = 0.0;
278 : double source_v = 0.0;
279 : double source_w = 0.0;
280 : double x_coord = grid->x[i];
281 : double y_coord = grid->y[j];
282 : double z_coord = (ctx->nz > 1 && grid->z) ? grid->z[k] : 0.0;
283 : compute_source_terms(x_coord, y_coord, z_coord, ctx->iter_count, dt, params,
284 : &source_u, &source_v, &source_w);
285 :
286 : // Intermediate velocity (without pressure gradient)
287 : u_star[idx] = u + (dt * (-conv_u + visc_u + source_u));
288 : v_star[idx] = v + (dt * (-conv_v + visc_v + source_v));
289 : w_star[idx] = w + (dt * (-conv_w + visc_w + source_w));
290 :
291 : // Limit velocities
292 : u_star[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, u_star[idx]));
293 : v_star[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, v_star[idx]));
294 : w_star[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, w_star[idx]));
295 : }
296 : }
297 : }
298 :
299 : // Copy boundary values from field to star arrays
300 0 : copy_boundary_velocities_3d(u_star, v_star, w_star, field->u, field->v, field->w,
301 : nx, ny, ctx->nz);
302 :
303 : // ============================================================
304 : // STEP 2: Solve Poisson equation for pressure
305 : // ∇²p = (ρ/dt) * ∇·u*
306 : // ============================================================
307 :
308 0 : double rho = field->rho[0];
309 0 : if (rho < 1e-10) {
310 0 : rho = 1.0;
311 : }
312 :
313 : // Compute RHS: divergence of intermediate velocity
314 0 : for (size_t k = ctx->k_start; k < ctx->k_end; k++) {
315 0 : size_t k_off = k * ctx->stride_z;
316 : #ifdef _OPENMP
317 0 : #pragma omp parallel for schedule(static)
318 : #endif
319 : for (jj = 1; jj < ny_int - 1; jj++) {
320 : size_t j = (size_t)jj;
321 : for (size_t i = 1; i < nx - 1; i++) {
322 : size_t idx = k_off + IDX_2D(i, j, nx);
323 :
324 : double du_star_dx = (u_star[idx + 1] - u_star[idx - 1]) / (2.0 * dx);
325 : double dv_star_dy = (v_star[idx + nx] - v_star[idx - nx]) / (2.0 * dy);
326 : double dw_star_dz = (w_star[idx + ctx->stride_z] -
327 : w_star[idx - ctx->stride_z]) * ctx->inv_2dz;
328 :
329 : double divergence = du_star_dx + dv_star_dy + dw_star_dz;
330 : rhs[idx] = (rho / dt) * divergence;
331 : }
332 : }
333 : }
334 :
335 : // Use SIMD Poisson solver (Conjugate Gradient with SIMD)
336 : // ctx->u_new is used as temp buffer for the Poisson solver
337 0 : int poisson_iters = poisson_solve_3d(p_new, ctx->u_new, rhs, nx, ny, ctx->nz,
338 : dx, dy, dz, POISSON_SOLVER_CG_SIMD);
339 :
340 0 : if (poisson_iters < 0) {
341 : return CFD_ERROR_MAX_ITER;
342 : }
343 :
344 : // ============================================================
345 : // STEP 3: Corrector - Project velocity to be divergence-free
346 : // u^(n+1) = u* - (dt/ρ) * ∇p
347 : // (OpenMP parallelized with SIMD inner loop)
348 : // ============================================================
349 :
350 0 : double dt_over_rho = dt / rho;
351 0 : double inv_2dx = 1.0 / (2.0 * dx);
352 0 : double inv_2dy = 1.0 / (2.0 * dy);
353 :
354 : #if USE_AVX
355 : __m256d dt_rho_vec = _mm256_set1_pd(dt_over_rho);
356 : __m256d inv_2dx_vec = _mm256_set1_pd(inv_2dx);
357 : __m256d inv_2dy_vec = _mm256_set1_pd(inv_2dy);
358 : __m256d inv_2dz_vec = _mm256_set1_pd(ctx->inv_2dz);
359 : __m256d max_vel_vec = _mm256_set1_pd(MAX_VELOCITY);
360 : __m256d neg_max_vel_vec = _mm256_set1_pd(-MAX_VELOCITY);
361 : #endif
362 :
363 0 : for (size_t k = ctx->k_start; k < ctx->k_end; k++) {
364 0 : size_t k_off = k * ctx->stride_z;
365 : #ifdef _OPENMP
366 0 : #pragma omp parallel for schedule(static)
367 : #endif
368 : for (jj = 1; jj < ny_int - 1; jj++) {
369 : size_t j = (size_t)jj;
370 : size_t i = 1;
371 :
372 : #if USE_AVX
373 : // SIMD loop - process 4 cells at once
374 : for (; i + 4 <= nx - 1; i += 4) {
375 : size_t idx = k_off + IDX_2D(i, j, nx);
376 :
377 : // Load pressure neighbors for gradient computation
378 : __m256d p_xp = _mm256_loadu_pd(&p_new[idx + 1]);
379 : __m256d p_xm = _mm256_loadu_pd(&p_new[idx - 1]);
380 : __m256d p_yp = _mm256_loadu_pd(&p_new[idx + nx]);
381 : __m256d p_ym = _mm256_loadu_pd(&p_new[idx - nx]);
382 : __m256d p_zp = _mm256_loadu_pd(&p_new[idx + ctx->stride_z]);
383 : __m256d p_zm = _mm256_loadu_pd(&p_new[idx - ctx->stride_z]);
384 :
385 : // Compute pressure gradients
386 : __m256d dp_dx = _mm256_mul_pd(_mm256_sub_pd(p_xp, p_xm), inv_2dx_vec);
387 : __m256d dp_dy = _mm256_mul_pd(_mm256_sub_pd(p_yp, p_ym), inv_2dy_vec);
388 : __m256d dp_dz = _mm256_mul_pd(_mm256_sub_pd(p_zp, p_zm), inv_2dz_vec);
389 :
390 : // Load intermediate velocities
391 : __m256d u_s = _mm256_loadu_pd(&u_star[idx]);
392 : __m256d v_s = _mm256_loadu_pd(&v_star[idx]);
393 : __m256d w_s = _mm256_loadu_pd(&w_star[idx]);
394 :
395 : // Corrector: u = u* - (dt/rho) * dp/dx
396 : __m256d u_new = _mm256_sub_pd(u_s, _mm256_mul_pd(dt_rho_vec, dp_dx));
397 : __m256d v_new = _mm256_sub_pd(v_s, _mm256_mul_pd(dt_rho_vec, dp_dy));
398 : __m256d w_new = _mm256_sub_pd(w_s, _mm256_mul_pd(dt_rho_vec, dp_dz));
399 :
400 : // Clamp velocities to [-MAX_VELOCITY, MAX_VELOCITY]
401 : u_new = _mm256_max_pd(neg_max_vel_vec, _mm256_min_pd(max_vel_vec, u_new));
402 : v_new = _mm256_max_pd(neg_max_vel_vec, _mm256_min_pd(max_vel_vec, v_new));
403 : w_new = _mm256_max_pd(neg_max_vel_vec, _mm256_min_pd(max_vel_vec, w_new));
404 :
405 : // Store results
406 : _mm256_storeu_pd(&field->u[idx], u_new);
407 : _mm256_storeu_pd(&field->v[idx], v_new);
408 : _mm256_storeu_pd(&field->w[idx], w_new);
409 : }
410 : #endif
411 :
412 : // Scalar remainder
413 : for (; i < nx - 1; i++) {
414 : size_t idx = k_off + IDX_2D(i, j, nx);
415 :
416 : double dp_dx = (p_new[idx + 1] - p_new[idx - 1]) * inv_2dx;
417 : double dp_dy = (p_new[idx + nx] - p_new[idx - nx]) * inv_2dy;
418 : double dp_dz = (p_new[idx + ctx->stride_z] - p_new[idx - ctx->stride_z]) *
419 : ctx->inv_2dz;
420 :
421 : field->u[idx] = u_star[idx] - (dt_over_rho * dp_dx);
422 : field->v[idx] = v_star[idx] - (dt_over_rho * dp_dy);
423 : field->w[idx] = w_star[idx] - (dt_over_rho * dp_dz);
424 :
425 : // Limit velocities
426 : field->u[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, field->u[idx]));
427 : field->v[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, field->v[idx]));
428 : field->w[idx] = fmax(-MAX_VELOCITY, fmin(MAX_VELOCITY, field->w[idx]));
429 : }
430 : }
431 : }
432 :
433 : // Update pressure field
434 0 : memcpy(field->p, p_new, size * sizeof(double));
435 :
436 : // Copy boundary velocity values from star arrays (which have caller's BCs)
437 0 : copy_boundary_velocities_3d(field->u, field->v, field->w, u_star, v_star, w_star,
438 : nx, ny, ctx->nz);
439 :
440 : // Check for NaN
441 0 : for (size_t n = 0; n < size; n++) {
442 0 : if (!isfinite(field->u[n]) || !isfinite(field->v[n]) ||
443 0 : !isfinite(field->w[n]) || !isfinite(field->p[n])) {
444 : return CFD_ERROR_DIVERGED;
445 : }
446 : }
447 :
448 0 : ctx->iter_count++;
449 :
450 0 : if (stats) {
451 0 : stats->iterations = 1;
452 : }
453 :
454 : return CFD_SUCCESS;
455 : }
|