Line data Source code
1 : /**
2 : * Boundary Conditions - SIMD Dispatcher with Runtime Detection
3 : *
4 : * This file provides the unified bc_impl_simd interface by selecting
5 : * the correct architecture-specific implementation at RUNTIME:
6 : * - AVX2 on x86-64 (detected via CPUID)
7 : * - NEON on ARM64 (always available on ARM64)
8 : *
9 : * The actual implementations remain in separate files:
10 : * - avx2/boundary_conditions_avx2.c
11 : * - neon/boundary_conditions_neon.c
12 : *
13 : * Compile-Time vs Runtime Detection:
14 : * ----------------------------------
15 : * The availability check (simd_available) uses BOTH:
16 : * 1. Runtime CPU detection: cfd_detect_simd_arch() checks if CPU supports AVX2/NEON
17 : * 2. Compile-time availability: Checks if function pointers are non-NULL
18 : *
19 : * This two-phase check handles the case where:
20 : * - CPU supports AVX2, but code was compiled without -mavx2 flag
21 : * - In this case, bc_impl_avx2 has NULL pointers, so simd_available()
22 : * returns false even though runtime detection reports AVX2 support.
23 : *
24 : * This design ensures safe operation: SIMD backend is only used when BOTH
25 : * the CPU supports it AND the code was compiled with SIMD instructions.
26 : *
27 : * Error Handling:
28 : * If called when no SIMD backend is available (programming error), these
29 : * dispatcher functions will:
30 : * 1. Call the user-configurable error handler (or print to stderr if none set)
31 : * 2. Assert in debug builds
32 : * 3. Fall back to scalar implementation to avoid leaving fields in invalid state
33 : *
34 : * Callers SHOULD check bc_simd_backend_available() before using this backend.
35 : */
36 :
37 : #include "../boundary_conditions_internal.h"
38 : #include "cfd/core/cpu_features.h"
39 : #include <stdbool.h>
40 : #include <stdint.h>
41 : #include <assert.h>
42 : #include <stdio.h>
43 :
/* Platform-specific atomic operations for thread-safe caching.
 *
 * Contract (matching GCC's __atomic_compare_exchange_n):
 *   ATOMIC_LOAD(ptr)                 - acquire-load of an intptr_t-sized value
 *   ATOMIC_CAS(ptr, expected, desired)
 *       - nonzero on success (swapped *ptr from *expected to desired)
 *       - on failure, writes the value actually observed in *ptr back
 *         through 'expected' so callers may decode it
 */
#ifdef _MSC_VER
#include <intrin.h>
#define ATOMIC_LOAD(ptr) _InterlockedCompareExchange64((volatile long long*)(ptr), 0, 0)
/* BUGFIX: the previous macro only compared the intrinsic's return value and
 * never updated *expected on failure, unlike the GCC path. Callers that
 * decode 'expected' after a failed CAS would read a stale sentinel. This
 * helper restores the GCC write-back semantics. */
static __inline int bc_msvc_cas64(volatile long long* ptr,
                                  long long* expected,
                                  long long desired) {
    long long observed = _InterlockedCompareExchange64(ptr, desired, *expected);
    if (observed == *expected) {
        return 1;
    }
    *expected = observed;
    return 0;
}
#define ATOMIC_CAS(ptr, expected, desired) \
    bc_msvc_cas64((volatile long long*)(ptr), (long long*)(expected), (long long)(desired))
#define COMPILER_BARRIER() _ReadWriteBarrier()
#else
/* GCC/Clang built-in atomics */
#define ATOMIC_LOAD(ptr) __atomic_load_n((ptr), __ATOMIC_ACQUIRE)
#define ATOMIC_CAS(ptr, expected, desired) \
    __atomic_compare_exchange_n((ptr), (expected), (desired), 0, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
#define COMPILER_BARRIER() __asm__ __volatile__("" ::: "memory")
#endif
58 :
59 : /* ============================================================================
60 : * Helper: Get SIMD backend based on runtime detection
61 : *
62 : * Returns the appropriate backend implementation table (AVX2 or NEON) based
63 : * on runtime CPU detection. Returns NULL if no SIMD backend is available.
64 : *
65 : * The result is cached after first call since SIMD architecture doesn't change
66 : * at runtime. This avoids redundant calls to cfd_detect_simd_arch() on every
67 : * boundary condition operation.
68 : *
69 : * Thread Safety:
70 : * Uses atomic compare-and-swap to ensure proper synchronization. Only one
71 : * thread will successfully initialize the cache; others will use the
72 : * already-cached result. Memory barriers ensure visibility across threads.
73 : * ============================================================================ */
74 :
75 : /* Cache for the SIMD backend pointer.
76 : * Values: 0 = not initialized, 1 = no backend, 2+ = valid backend pointer + 1
77 : * Using intptr_t allows atomic operations and encodes state in a single variable. */
78 : static volatile intptr_t g_simd_backend_cache = 0;
79 :
80 : /* Sentinel values for cache state */
81 : #define CACHE_UNINITIALIZED 0
82 : #define CACHE_NO_BACKEND 1
83 :
84 0 : static const bc_backend_impl_t* get_simd_backend(void) {
85 : /* Fast path: check if already initialized */
86 0 : intptr_t cached = (intptr_t)ATOMIC_LOAD(&g_simd_backend_cache);
87 :
88 0 : if (cached != CACHE_UNINITIALIZED) {
89 0 : if (cached == CACHE_NO_BACKEND) {
90 : return NULL;
91 : }
92 : /* Decode pointer: subtract 1 and cast back */
93 0 : return (const bc_backend_impl_t*)(cached - 1);
94 : }
95 :
96 : /* Slow path: detect SIMD backend */
97 0 : cfd_simd_arch_t arch = cfd_detect_simd_arch();
98 0 : const bc_backend_impl_t* result = NULL;
99 :
100 0 : if (arch == CFD_SIMD_AVX2 && bc_impl_avx2.apply_neumann != NULL) {
101 : result = &bc_impl_avx2;
102 0 : } else if (arch == CFD_SIMD_NEON && bc_impl_neon.apply_neumann != NULL) {
103 : result = &bc_impl_neon;
104 : }
105 :
106 : /* Encode result: NULL becomes CACHE_NO_BACKEND, valid pointer becomes ptr+1 */
107 0 : intptr_t new_value = result ? ((intptr_t)result + 1) : CACHE_NO_BACKEND;
108 :
109 : /* Try to set cache atomically. If another thread beat us, use their result. */
110 0 : intptr_t expected = CACHE_UNINITIALIZED;
111 0 : if (!ATOMIC_CAS(&g_simd_backend_cache, &expected, new_value)) {
112 : /* Another thread initialized first - use their cached value */
113 0 : if (expected == CACHE_NO_BACKEND) {
114 : return NULL;
115 : }
116 0 : return (const bc_backend_impl_t*)(expected - 1);
117 : }
118 :
119 : return result;
120 : }
121 :
122 : /**
123 : * Report error when SIMD backend is unavailable.
124 : * Called as a programming error fallback - callers should check availability first.
125 : */
126 0 : static void report_no_simd_error(const char* function) {
127 0 : char message[128];
128 0 : snprintf(message, sizeof(message),
129 : "SIMD backend called but no SIMD available (detected: %s). "
130 : "Falling back to scalar.",
131 : cfd_get_simd_name());
132 0 : bc_report_error(BC_ERROR_NO_SIMD_BACKEND, function, message);
133 0 : assert(0 && "SIMD backend called without available implementation");
134 : }
135 :
136 : /* ============================================================================
137 : * Runtime Dispatching Functions
138 : *
139 : * These functions use get_simd_backend() for unified dispatch logic.
140 : * ============================================================================ */
141 :
142 0 : static void bc_simd_neumann(double* field, size_t nx, size_t ny,
143 : size_t nz, size_t stride_z) {
144 0 : const bc_backend_impl_t* impl = get_simd_backend();
145 0 : if (impl != NULL) {
146 0 : impl->apply_neumann(field, nx, ny, nz, stride_z);
147 0 : return;
148 : }
149 0 : report_no_simd_error("bc_simd_neumann");
150 : bc_apply_neumann_scalar_impl(field, nx, ny, nz, stride_z);
151 : }
152 :
153 0 : static void bc_simd_periodic(double* field, size_t nx, size_t ny,
154 : size_t nz, size_t stride_z) {
155 0 : const bc_backend_impl_t* impl = get_simd_backend();
156 0 : if (impl != NULL) {
157 0 : impl->apply_periodic(field, nx, ny, nz, stride_z);
158 0 : return;
159 : }
160 0 : report_no_simd_error("bc_simd_periodic");
161 : bc_apply_periodic_scalar_impl(field, nx, ny, nz, stride_z);
162 : }
163 :
164 0 : static void bc_simd_dirichlet(double* field, size_t nx, size_t ny,
165 : size_t nz, size_t stride_z,
166 : const bc_dirichlet_values_t* values) {
167 0 : const bc_backend_impl_t* impl = get_simd_backend();
168 0 : if (impl != NULL) {
169 0 : impl->apply_dirichlet(field, nx, ny, nz, stride_z, values);
170 0 : return;
171 : }
172 0 : report_no_simd_error("bc_simd_dirichlet");
173 : bc_apply_dirichlet_scalar_impl(field, nx, ny, nz, stride_z, values);
174 : }
175 :
176 0 : static cfd_status_t bc_simd_inlet(double* u, double* v, double* w,
177 : size_t nx, size_t ny,
178 : size_t nz, size_t stride_z,
179 : const bc_inlet_config_t* config) {
180 : /* Inlet BCs operate on 1D boundaries - SIMD provides limited benefit.
181 : * Delegate to the architecture-specific backend if available, otherwise
182 : * fall back to scalar implementation. */
183 0 : const bc_backend_impl_t* impl = get_simd_backend();
184 0 : if (impl != NULL && impl->apply_inlet != NULL) {
185 0 : return impl->apply_inlet(u, v, w, nx, ny, nz, stride_z, config);
186 : }
187 : /* Fall back to scalar implementation for inlet */
188 0 : return bc_apply_inlet_scalar_impl(u, v, w, nx, ny, nz, stride_z, config);
189 : }
190 :
191 0 : static cfd_status_t bc_simd_outlet(double* field, size_t nx, size_t ny,
192 : size_t nz, size_t stride_z,
193 : const bc_outlet_config_t* config) {
194 : /* Outlet BCs operate on 1D boundaries - SIMD provides limited benefit
195 : * except for top/bottom edges where memory is contiguous.
196 : * Delegate to the architecture-specific backend if available, otherwise
197 : * fall back to scalar implementation. */
198 0 : const bc_backend_impl_t* impl = get_simd_backend();
199 0 : if (impl != NULL && impl->apply_outlet != NULL) {
200 0 : return impl->apply_outlet(field, nx, ny, nz, stride_z, config);
201 : }
202 : /* Fall back to scalar implementation for outlet */
203 0 : return bc_apply_outlet_scalar_impl(field, nx, ny, nz, stride_z, config);
204 : }
205 :
206 0 : static cfd_status_t bc_simd_symmetry(double* u, double* v, double* w,
207 : size_t nx, size_t ny,
208 : size_t nz, size_t stride_z,
209 : const bc_symmetry_config_t* config) {
210 : /* Symmetry BCs operate on 1D boundaries - SIMD provides limited benefit.
211 : * Delegate to the architecture-specific backend if available, otherwise
212 : * fall back to scalar implementation. */
213 0 : const bc_backend_impl_t* impl = get_simd_backend();
214 0 : if (impl != NULL && impl->apply_symmetry != NULL) {
215 0 : return impl->apply_symmetry(u, v, w, nx, ny, nz, stride_z, config);
216 : }
217 : /* Fall back to scalar implementation for symmetry */
218 0 : return bc_apply_symmetry_scalar_impl(u, v, w, nx, ny, nz, stride_z, config);
219 : }
220 :
221 : /* ============================================================================
222 : * Check if SIMD backend is available at runtime
223 : * ============================================================================ */
224 :
225 : /**
226 : * Check if a backend implementation table is fully populated.
227 : * All three function pointers must be non-NULL for the backend to be usable.
228 : */
229 234460 : static bool backend_impl_complete(const bc_backend_impl_t* impl) {
230 234460 : return impl->apply_neumann != NULL &&
231 0 : impl->apply_periodic != NULL &&
232 0 : impl->apply_dirichlet != NULL;
233 : }
234 :
235 : /**
236 : * Check if any SIMD implementation is available.
237 : * Called during initialization to determine if bc_impl_simd should be used.
238 : *
239 : * Verifies all three function pointers (neumann, periodic, dirichlet) are present.
240 : * This ensures the backend is fully functional, not just partially implemented.
241 : */
242 234460 : static bool simd_available(void) {
243 234460 : cfd_simd_arch_t arch = cfd_detect_simd_arch();
244 :
245 234460 : if (arch == CFD_SIMD_AVX2) {
246 468920 : return backend_impl_complete(&bc_impl_avx2);
247 0 : } else if (arch == CFD_SIMD_NEON) {
248 0 : return backend_impl_complete(&bc_impl_neon);
249 : }
250 : return false;
251 : }
252 :
253 : /* ============================================================================
254 : * Unified SIMD Interface
255 : *
256 : * bc_impl_simd provides runtime-dispatching functions.
257 : * The functions check availability internally.
258 : * ============================================================================ */
259 :
/* Runtime-dispatching backend table. Unlike the arch-specific tables
 * (bc_impl_avx2 / bc_impl_neon), every entry here is always non-NULL:
 * each function performs its own runtime dispatch and scalar fallback.
 * Consequently, NULL-checking these pointers says nothing about real SIMD
 * availability — use bc_simd_backend_available() for that. */
const bc_backend_impl_t bc_impl_simd = {
    .apply_neumann = bc_simd_neumann,
    .apply_periodic = bc_simd_periodic,
    .apply_dirichlet = bc_simd_dirichlet,
    .apply_inlet = bc_simd_inlet,
    .apply_outlet = bc_simd_outlet,
    .apply_symmetry = bc_simd_symmetry
};
268 :
269 : /**
270 : * Query function for external code to check if SIMD is actually available.
271 : * This is used by the backend availability check since bc_impl_simd
272 : * always has non-NULL function pointers (they do runtime dispatch).
273 : */
274 234460 : bool bc_simd_backend_available(void) {
275 234460 : return simd_available();
276 : }
277 :
278 : /**
279 : * Get the name of the detected SIMD architecture.
280 : * Returns "avx2", "neon", or "none" based on runtime detection.
281 : */
282 0 : const char* bc_simd_get_arch_name(void) {
283 0 : return cfd_get_simd_name();
284 : }
|