Line data Source code
1 : /**
2 : * @file linear_solver_cg.c
3 : * @brief Conjugate Gradient solver - scalar CPU implementation
4 : *
5 : * Conjugate Gradient method for solving Ax = b where A is symmetric positive
6 : * definite (SPD). For the Poisson equation with Neumann BCs, the Laplacian
7 : * operator is SPD on the space orthogonal to constants.
8 : *
9 : * CG characteristics:
10 : * - Optimal for SPD systems (Poisson with appropriate BCs)
11 : * - Converges in at most N iterations for N unknowns (in exact arithmetic)
12 : * - Typically converges in O(sqrt(cond(A))) iterations
13 : * - Requires extra storage for r, p, Ap vectors
14 : * - Each iteration: 1 matrix-vector product, 2 dot products, 2 axpy operations
15 : *
16 : * Algorithm (standard CG):
17 : * r_0 = b - A*x_0
18 : * p_0 = r_0
19 : * for k = 0, 1, 2, ...
20 : * alpha_k = (r_k, r_k) / (p_k, A*p_k)
21 : * x_{k+1} = x_k + alpha_k * p_k
22 : * r_{k+1} = r_k - alpha_k * A*p_k
23 : * beta_k = (r_{k+1}, r_{k+1}) / (r_k, r_k)
24 : * p_{k+1} = r_{k+1} + beta_k * p_k
25 : */
26 :
27 : #include "../linear_solver_internal.h"
28 :
29 : #include "cfd/core/indexing.h"
30 : #include "cfd/core/memory.h"
31 :
32 : #include <math.h>
33 : #include <stdio.h>
34 : #include <string.h>
35 :
36 : /* ============================================================================
37 : * CG CONTEXT
38 : * ============================================================================ */
39 :
40 : typedef struct {
41 : double dx2; /* dx^2 */
42 : double dy2; /* dy^2 */
43 : double inv_dz2; /* 1/dz^2 (0 for 2D) */
44 : double diag_inv; /* Jacobi preconditioner: 1/(2/dx2 + 2/dy2 + 2*inv_dz2) */
45 :
46 : size_t stride_z; /* nx*ny for 3D, 0 for 2D */
47 : size_t k_start; /* first interior k index */
48 : size_t k_end; /* one-past-last interior k index */
49 :
50 : /* CG working vectors (allocated during init) */
51 : double* r; /* Residual vector */
52 : double* z; /* Preconditioned residual (NULL if no precond) */
53 : double* p; /* Search direction */
54 : double* Ap; /* A * p (Laplacian applied to p) */
55 :
56 : int use_precond; /* Flag: is preconditioner enabled? */
57 : int initialized;
58 : } cg_context_t;
59 :
60 : /* ============================================================================
61 : * HELPER FUNCTIONS
62 : * ============================================================================ */
63 :
64 : /**
65 : * Compute dot product of two vectors (interior points only)
66 : */
67 385534 : static double dot_product(const double* a, const double* b,
68 : size_t nx, size_t ny,
69 : size_t k_start, size_t k_end, size_t stride_z) {
70 385534 : double sum = 0.0;
71 1791836 : for (size_t k = k_start; k < k_end; k++) {
72 22480213 : for (size_t j = 1; j < ny - 1; j++) {
73 558430736 : for (size_t i = 1; i < nx - 1; i++) {
74 537166723 : size_t idx = (k * stride_z) + (IDX_2D(i, j, nx));
75 537166723 : sum += a[idx] * b[idx];
76 : }
77 : }
78 : }
79 385534 : return sum;
80 : }
81 :
82 : /**
83 : * Compute y = y + alpha * x (interior points only)
84 : */
85 380746 : static void axpy(double alpha, const double* x, double* y,
86 : size_t nx, size_t ny,
87 : size_t k_start, size_t k_end, size_t stride_z) {
88 1176014 : for (size_t k = k_start; k < k_end; k++) {
89 14735142 : for (size_t j = 1; j < ny - 1; j++) {
90 367904464 : for (size_t i = 1; i < nx - 1; i++) {
91 353964590 : size_t idx = (k * stride_z) + (IDX_2D(i, j, nx));
92 353964590 : y[idx] += alpha * x[idx];
93 : }
94 : }
95 : }
96 : }
97 :
98 : /**
99 : * Apply negative Laplacian operator: Ap = -nabla^2(p)
100 : * For Poisson equation: nabla^2(x) = rhs, we solve -nabla^2(x) = -rhs
101 : * which is SPD with positive eigenvalues.
102 : */
103 190373 : static void apply_laplacian(const double* p, double* Ap,
104 : size_t nx, size_t ny,
105 : double dx2, double dy2, double inv_dz2,
106 : size_t k_start, size_t k_end, size_t stride_z) {
107 190373 : double dx2_inv = 1.0 / dx2;
108 190373 : double dy2_inv = 1.0 / dy2;
109 :
110 588007 : for (size_t k = k_start; k < k_end; k++) {
111 7367571 : for (size_t j = 1; j < ny - 1; j++) {
112 183952232 : for (size_t i = 1; i < nx - 1; i++) {
113 176982295 : size_t idx = (k * stride_z) + (IDX_2D(i, j, nx));
114 :
115 176982295 : double laplacian =
116 176982295 : ((p[idx + 1] - (2.0 * p[idx]) + p[idx - 1]) * dx2_inv)
117 176982295 : + ((p[idx + (nx)] - (2.0 * p[idx]) + p[idx - (nx)]) * dy2_inv)
118 176982295 : + ((p[idx + (stride_z)] + p[idx - (stride_z)] - (2.0 * p[idx])) * inv_dz2);
119 176982295 : Ap[idx] = -laplacian;
120 : }
121 : }
122 : }
123 190373 : }
124 :
125 : /**
126 : * Compute initial residual: r = b - A*x
127 : * For our formulation: r = -rhs - (-nabla^2 x) = -rhs + nabla^2 x
128 : * which simplifies to: r = -(rhs - nabla^2 x)
129 : *
130 : * To maintain consistency with the standard CG formulation where we solve
131 : * -nabla^2 x = -rhs (i.e., A = -nabla^2, b = -rhs), we compute:
132 : * r = b - Ax = -rhs - (-nabla^2 x) = -rhs + nabla^2 x
133 : */
134 4788 : static void compute_residual(const double* x, const double* rhs, double* r,
135 : size_t nx, size_t ny,
136 : double dx2, double dy2, double inv_dz2,
137 : size_t k_start, size_t k_end, size_t stride_z) {
138 4788 : double dx2_inv = 1.0 / dx2;
139 4788 : double dy2_inv = 1.0 / dy2;
140 :
141 16437 : for (size_t k = k_start; k < k_end; k++) {
142 188750 : for (size_t j = 1; j < ny - 1; j++) {
143 3287020 : for (size_t i = 1; i < nx - 1; i++) {
144 3109919 : size_t idx = (k * stride_z) + (IDX_2D(i, j, nx));
145 :
146 : /* Laplacian(x) = d^2x/dx^2 + d^2x/dy^2 */
147 3109919 : double laplacian =
148 3109919 : ((x[idx + 1] - (2.0 * x[idx]) + x[idx - 1]) * dx2_inv)
149 3109919 : + ((x[idx + (nx)] - (2.0 * x[idx]) + x[idx - (nx)]) * dy2_inv)
150 3109919 : + ((x[idx + (stride_z)] + x[idx - (stride_z)] - (2.0 * x[idx])) * inv_dz2);
151 :
152 : /* r = -rhs + laplacian = -(rhs - laplacian)
153 : * This is b - Ax where A = -nabla^2 and b = -rhs */
154 3109919 : r[idx] = -rhs[idx] + laplacian;
155 : }
156 : }
157 : }
158 4788 : }
159 :
160 : /**
161 : * Copy vector p = r (interior points only)
162 : */
163 : static void copy_vector(const double* src, double* dst,
164 : size_t nx, size_t ny,
165 : size_t k_start, size_t k_end, size_t stride_z) {
166 16432 : for (size_t k = k_start; k < k_end; k++) {
167 188750 : for (size_t j = 1; j < ny - 1; j++) {
168 3287020 : for (size_t i = 1; i < nx - 1; i++) {
169 3109919 : size_t idx = (k * stride_z) + (IDX_2D(i, j, nx));
170 3109919 : dst[idx] = src[idx];
171 : }
172 : }
173 : }
174 : }
175 :
176 : /**
177 : * Apply Jacobi preconditioner: z = M^{-1} * r
178 : * For the negative Laplacian -nabla^2, the diagonal is (2/dx^2 + 2/dy^2 + 2*inv_dz2),
179 : * so M^{-1} = 1/(2/dx^2 + 2/dy^2 + 2*inv_dz2) = diag_inv.
180 : */
181 : static void apply_jacobi_precond(const double* r, double* z,
182 : size_t nx, size_t ny,
183 : double diag_inv,
184 : size_t k_start, size_t k_end, size_t stride_z) {
185 552 : for (size_t k = k_start; k < k_end; k++) {
186 11472 : for (size_t j = 1; j < ny - 1; j++) {
187 545328 : for (size_t i = 1; i < nx - 1; i++) {
188 534132 : size_t idx = (k * stride_z) + (IDX_2D(i, j, nx));
189 534132 : z[idx] = diag_inv * r[idx];
190 : }
191 : }
192 : }
193 : }
194 :
195 : /* ============================================================================
196 : * CG SCALAR IMPLEMENTATION
197 : * ============================================================================ */
198 :
199 58 : static cfd_status_t cg_scalar_init(
200 : poisson_solver_t* solver,
201 : size_t nx, size_t ny, size_t nz,
202 : double dx, double dy, double dz,
203 : const poisson_solver_params_t* params)
204 : {
205 58 : cg_context_t* ctx = (cg_context_t*)cfd_calloc(1, sizeof(cg_context_t));
206 58 : if (!ctx) {
207 : return CFD_ERROR_NOMEM;
208 : }
209 :
210 58 : ctx->dx2 = dx * dx;
211 58 : ctx->dy2 = dy * dy;
212 58 : ctx->inv_dz2 = poisson_solver_compute_inv_dz2(dz);
213 58 : poisson_solver_compute_3d_bounds(nz, nx, ny, &ctx->stride_z, &ctx->k_start, &ctx->k_end);
214 :
215 : /* Compute Jacobi preconditioner diagonal inverse */
216 58 : ctx->diag_inv = 1.0 / (2.0 / ctx->dx2 + 2.0 / ctx->dy2 + 2.0 * ctx->inv_dz2);
217 :
218 : /* Check if preconditioner is enabled */
219 58 : ctx->use_precond = (params && params->preconditioner == POISSON_PRECOND_JACOBI);
220 :
221 : /* Allocate working vectors */
222 58 : size_t n = nx * ny * nz;
223 58 : ctx->r = (double*)cfd_calloc(n, sizeof(double));
224 58 : ctx->p = (double*)cfd_calloc(n, sizeof(double));
225 58 : ctx->Ap = (double*)cfd_calloc(n, sizeof(double));
226 58 : ctx->z = NULL;
227 :
228 58 : if (!ctx->r || !ctx->p || !ctx->Ap) {
229 0 : cfd_free(ctx->r);
230 0 : cfd_free(ctx->p);
231 0 : cfd_free(ctx->Ap);
232 0 : cfd_free(ctx);
233 0 : return CFD_ERROR_NOMEM;
234 : }
235 :
236 : /* Allocate preconditioner vector if needed */
237 58 : if (ctx->use_precond) {
238 5 : ctx->z = (double*)cfd_calloc(n, sizeof(double));
239 5 : if (!ctx->z) {
240 0 : cfd_free(ctx->r);
241 0 : cfd_free(ctx->p);
242 0 : cfd_free(ctx->Ap);
243 0 : cfd_free(ctx);
244 0 : return CFD_ERROR_NOMEM;
245 : }
246 : }
247 :
248 58 : ctx->initialized = 1;
249 58 : solver->context = ctx;
250 58 : return CFD_SUCCESS;
251 : }
252 :
253 62 : static void cg_scalar_destroy(poisson_solver_t* solver) {
254 62 : if (solver && solver->context) {
255 58 : cg_context_t* ctx = (cg_context_t*)solver->context;
256 58 : cfd_free(ctx->r);
257 58 : cfd_free(ctx->z);
258 58 : cfd_free(ctx->p);
259 58 : cfd_free(ctx->Ap);
260 58 : cfd_free(ctx);
261 58 : solver->context = NULL;
262 : }
263 62 : }
264 :
265 : /**
266 : * CG solve function (supports both standard CG and preconditioned CG)
267 : *
268 : * CG uses its own solve loop (not the common one) because:
269 : * 1. The iterate function doesn't fit the standard pattern
270 : * 2. CG maintains state across iterations (rho)
271 : * 3. Convergence is measured by residual norm, not separate residual computation
272 : *
273 : * Standard CG: Preconditioned CG (PCG):
274 : * r = b - Ax r = b - Ax
275 : * p = r z = M^{-1}r
276 : * p = z
277 : * rho = (r,r) rho = (r,z)
278 : *
279 : * loop: loop:
280 : * Ap = A*p Ap = A*p
281 : * alpha = rho / (p,Ap) alpha = rho / (p,Ap)
282 : * x += alpha*p x += alpha*p
283 : * r -= alpha*Ap r -= alpha*Ap
284 : * rho_new = (r,r) z = M^{-1}r
285 : * beta = rho_new/rho rho_new = (r,z)
286 : * p = r + beta*p beta = rho_new/rho
287 : * rho = rho_new p = z + beta*p
288 : * rho = rho_new
289 : */
290 4788 : static cfd_status_t cg_scalar_solve(
291 : poisson_solver_t* solver,
292 : double* x,
293 : double* x_temp,
294 : const double* rhs,
295 : poisson_solver_stats_t* stats)
296 : {
297 4788 : (void)x_temp; /* CG doesn't use the temp buffer */
298 :
299 4788 : cg_context_t* ctx = (cg_context_t*)solver->context;
300 4788 : size_t nx = solver->nx;
301 4788 : size_t ny = solver->ny;
302 4788 : double dx2 = ctx->dx2;
303 4788 : double dy2 = ctx->dy2;
304 4788 : size_t stride_z = ctx->stride_z;
305 4788 : size_t k_start = ctx->k_start;
306 4788 : size_t k_end = ctx->k_end;
307 4788 : double inv_dz2 = ctx->inv_dz2;
308 :
309 4788 : double* r = ctx->r;
310 4788 : double* z = ctx->z;
311 4788 : double* p = ctx->p;
312 4788 : double* Ap = ctx->Ap;
313 4788 : int use_precond = ctx->use_precond;
314 4788 : double diag_inv = ctx->diag_inv;
315 :
316 4788 : poisson_solver_params_t* params = &solver->params;
317 4788 : double start_time = poisson_solver_get_time_ms();
318 :
319 : /* Apply initial boundary conditions */
320 4788 : poisson_solver_apply_bc(solver, x);
321 :
322 : /* Compute initial residual: r_0 = b - A*x_0 */
323 4788 : compute_residual(x, rhs, r, nx, ny, dx2, dy2, inv_dz2, k_start, k_end, stride_z);
324 :
325 : /* Initialize search direction and rho */
326 4788 : double rho;
327 4788 : if (use_precond) {
328 : /* z_0 = M^{-1} r_0 */
329 10 : apply_jacobi_precond(r, z, nx, ny, diag_inv, k_start, k_end, stride_z);
330 : /* p_0 = z_0 */
331 10 : copy_vector(z, p, nx, ny, k_start, k_end, stride_z);
332 : /* rho_0 = (r_0, z_0) */
333 4788 : rho = dot_product(r, z, nx, ny, k_start, k_end, stride_z);
334 : } else {
335 : /* p_0 = r_0 */
336 16427 : copy_vector(r, p, nx, ny, k_start, k_end, stride_z);
337 : /* rho_0 = (r_0, r_0) */
338 4788 : rho = dot_product(r, r, nx, ny, k_start, k_end, stride_z);
339 : }
340 :
341 4788 : double initial_res = sqrt(dot_product(r, r, nx, ny, k_start, k_end, stride_z));
342 :
343 4788 : if (stats) {
344 4788 : stats->initial_residual = initial_res;
345 : }
346 :
347 : /* Check if already converged */
348 4788 : double tolerance = params->tolerance * initial_res;
349 4788 : if (tolerance < params->absolute_tolerance) {
350 156 : tolerance = params->absolute_tolerance;
351 : }
352 :
353 4788 : if (initial_res < params->absolute_tolerance) {
354 156 : if (stats) {
355 156 : stats->status = POISSON_CONVERGED;
356 156 : stats->iterations = 0;
357 156 : stats->final_residual = initial_res;
358 156 : stats->elapsed_time_ms = poisson_solver_get_time_ms() - start_time;
359 : }
360 156 : return CFD_SUCCESS;
361 : }
362 :
363 190373 : int converged = 0;
364 : int iter;
365 : double res_norm = initial_res;
366 :
367 190373 : for (iter = 0; iter < params->max_iterations; iter++) {
368 : /* Compute Ap = A * p */
369 190373 : apply_laplacian(p, Ap, nx, ny, dx2, dy2, inv_dz2, k_start, k_end, stride_z);
370 :
371 : /* alpha = rho / (p, Ap) */
372 190373 : double p_dot_Ap = dot_product(p, Ap, nx, ny, k_start, k_end, stride_z);
373 :
374 : /* Check for breakdown (p_dot_Ap should be positive for SPD) */
375 190373 : CG_CHECK_BREAKDOWN(p_dot_Ap, stats, iter, res_norm, start_time);
376 :
377 190373 : double alpha = rho / p_dot_Ap;
378 :
379 : /* x_{k+1} = x_k + alpha * p */
380 190373 : axpy(alpha, p, x, nx, ny, k_start, k_end, stride_z);
381 :
382 : /* r_{k+1} = r_k - alpha * Ap */
383 190373 : axpy(-alpha, Ap, r, nx, ny, k_start, k_end, stride_z);
384 :
385 : /* Compute rho_new and update search direction */
386 190373 : double rho_new;
387 190373 : if (use_precond) {
388 : /* z_{k+1} = M^{-1} r_{k+1} */
389 542 : apply_jacobi_precond(r, z, nx, ny, diag_inv, k_start, k_end, stride_z);
390 : /* rho_new = (r_{k+1}, z_{k+1}) */
391 190373 : rho_new = dot_product(r, z, nx, ny, k_start, k_end, stride_z);
392 : } else {
393 : /* rho_new = (r_{k+1}, r_{k+1}) */
394 190373 : rho_new = dot_product(r, r, nx, ny, k_start, k_end, stride_z);
395 : }
396 :
397 190373 : res_norm = sqrt(dot_product(r, r, nx, ny, k_start, k_end, stride_z));
398 :
399 : /* Check convergence at intervals */
400 190373 : if (iter % params->check_interval == 0) {
401 190373 : if (params->verbose) {
402 0 : printf(" CG Iter %d: residual = %.6e\n", iter, res_norm);
403 : }
404 :
405 190373 : if (res_norm < tolerance || res_norm < params->absolute_tolerance) {
406 : converged = 1;
407 : break;
408 : }
409 : }
410 :
411 : /* Check for breakdown in rho before computing beta */
412 185741 : CG_CHECK_BREAKDOWN(rho, stats, iter, res_norm, start_time);
413 :
414 : /* beta = rho_new / rho */
415 185741 : double beta = rho_new / rho;
416 :
417 : /* p_{k+1} = (z or r)_{k+1} + beta * p_k */
418 185741 : if (use_precond) {
419 532 : for (size_t k = k_start; k < k_end; k++) {
420 11120 : for (size_t j = 1; j < ny - 1; j++) {
421 530832 : for (size_t i = 1; i < nx - 1; i++) {
422 519978 : size_t idx = (k * stride_z) + (IDX_2D(i, j, nx));
423 519978 : p[idx] = z[idx] + beta * p[idx];
424 : }
425 : }
426 : }
427 : } else {
428 571605 : for (size_t k = k_start; k < k_end; k++) {
429 7171466 : for (size_t j = 1; j < ny - 1; j++) {
430 180171442 : for (size_t i = 1; i < nx - 1; i++) {
431 173386106 : size_t idx = (k * stride_z) + (IDX_2D(i, j, nx));
432 173386106 : p[idx] = r[idx] + beta * p[idx];
433 : }
434 : }
435 : }
436 : }
437 :
438 185741 : rho = rho_new;
439 : }
440 :
441 : /* Final convergence check (in case we converged between check intervals) */
442 4632 : if (!converged && (res_norm < tolerance || res_norm < params->absolute_tolerance)) {
443 4632 : converged = 1;
444 : }
445 :
446 : /* Apply final boundary conditions */
447 4632 : poisson_solver_apply_bc(solver, x);
448 :
449 4632 : double end_time = poisson_solver_get_time_ms();
450 :
451 4632 : if (stats) {
452 : /* When loop breaks early (converged), iter is the last iteration index, so +1.
453 : * When loop completes naturally, iter == max_iterations, report as-is. */
454 4632 : stats->iterations = (iter < params->max_iterations) ? (iter + 1) : iter;
455 4632 : stats->final_residual = res_norm;
456 4632 : stats->elapsed_time_ms = end_time - start_time;
457 4632 : stats->status = converged ? POISSON_CONVERGED : POISSON_MAX_ITER;
458 : }
459 :
460 4632 : return converged ? CFD_SUCCESS : CFD_ERROR_MAX_ITER;
461 : }
462 :
463 : /**
464 : * Single iteration is not well-defined for CG as it maintains internal state.
465 : * We provide a minimal implementation that returns the current residual.
466 : */
467 0 : static cfd_status_t cg_scalar_iterate(
468 : poisson_solver_t* solver,
469 : double* x,
470 : double* x_temp,
471 : const double* rhs,
472 : double* residual)
473 : {
474 0 : (void)x_temp;
475 :
476 : /* CG doesn't support single iteration mode well.
477 : * Return the current residual from the Laplacian. */
478 0 : if (residual) {
479 0 : *residual = poisson_solver_compute_residual(solver, x, rhs);
480 : }
481 0 : return CFD_SUCCESS;
482 : }
483 :
484 : /* ============================================================================
485 : * FACTORY FUNCTION
486 : * ============================================================================ */
487 :
488 62 : poisson_solver_t* create_cg_scalar_solver(void) {
489 62 : poisson_solver_t* solver = (poisson_solver_t*)cfd_calloc(1, sizeof(poisson_solver_t));
490 62 : if (!solver) {
491 : return NULL;
492 : }
493 :
494 62 : solver->name = POISSON_SOLVER_TYPE_CG_SCALAR;
495 62 : solver->description = "Conjugate Gradient (scalar CPU)";
496 62 : solver->method = POISSON_METHOD_CG;
497 62 : solver->backend = POISSON_BACKEND_SCALAR;
498 62 : solver->params = poisson_solver_params_default();
499 :
500 62 : solver->init = cg_scalar_init;
501 62 : solver->destroy = cg_scalar_destroy;
502 62 : solver->solve = cg_scalar_solve; /* CG uses custom solve loop */
503 62 : solver->iterate = cg_scalar_iterate;
504 62 : solver->apply_bc = NULL; /* Use default Neumann */
505 :
506 62 : return solver;
507 : }
|