Line data Source code
1 : /**
2 : * CPU Feature Detection Implementation
3 : *
4 : * Runtime detection of CPU SIMD capabilities using platform-specific methods:
5 : * - x86/x64 (MSVC): __cpuid/__cpuidex intrinsics + XGETBV for OS support
6 : * - x86/x64 (GCC/Clang): __get_cpuid_count + xgetbv for OS support
7 : * - ARM64: NEON is always available
8 : *
9 : * IMPORTANT: For AVX2, we must verify both:
10 : * 1. CPU support (CPUID leaf 7, EBX bit 5)
11 : * 2. OS support (OSXSAVE enabled + XCR0 bits 1,2 set for AVX state saving)
12 : *
13 : * Without OS support verification, AVX instructions will cause illegal
14 : * instruction exceptions on systems where the OS hasn't enabled AVX.
15 : *
16 : * Thread Safety:
17 : * The cache uses atomic operations with proper memory ordering to ensure
18 : * thread-safe initialization. The detection logic is idempotent (always
19 : * produces the same result), so multiple threads racing to initialize is
20 : * safe - they will all compute and store the same value.
21 : *
22 : * We use:
23 : * - C11 stdatomic.h when available (GCC 4.9+, Clang 3.1+, MSVC 2022+)
24 : * - GCC __atomic builtins as fallback (GCC 4.7+)
25 : * - MSVC Interlocked functions as final fallback
26 : */
27 :
28 : #include "cfd/core/cpu_features.h"
29 :
30 : /* ============================================================================
31 : * Atomic Operations Abstraction
32 : *
33 : * Provides atomic load/store with proper memory ordering across platforms.
34 : * Priority: C11 atomics > GCC atomics > MSVC Interlocked > volatile fallback
35 : * ============================================================================ */
36 :
/* Check for C11 atomics support */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
/* C11 atomics available */
#include <stdatomic.h>
#define CFD_HAS_C11_ATOMICS 1
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
/* GCC 4.7+ atomic builtins */
#define CFD_HAS_GCC_ATOMICS 1
#elif defined(_MSC_VER)
/* MSVC Interlocked functions */
#include <intrin.h>
#define CFD_HAS_MSVC_ATOMICS 1
#endif

/* Atomic cache type and operations.
 *
 * Cache protocol (identical across all four backends):
 *   -1  -> detection not yet performed (initial state)
 *   >=0 -> a cfd_simd_arch_t value published by atomic_cache_store()
 *
 * Loads use acquire ordering and stores use release ordering so that a
 * reader observing a non-negative value also observes any writes made
 * before the publishing store. */
#if defined(CFD_HAS_C11_ATOMICS)
static _Atomic int g_simd_arch_cache = -1;

/* Acquire-load of the cached detection result (C11 stdatomic). */
static inline int atomic_cache_load(void) {
    return atomic_load_explicit(&g_simd_arch_cache, memory_order_acquire);
}

/* Release-store of the detection result (C11 stdatomic). */
static inline void atomic_cache_store(int value) {
    atomic_store_explicit(&g_simd_arch_cache, value, memory_order_release);
}

#elif defined(CFD_HAS_GCC_ATOMICS)
static int g_simd_arch_cache = -1;

/* Acquire-load via GCC/Clang __atomic builtin (GCC 4.7+). */
static inline int atomic_cache_load(void) {
    return __atomic_load_n(&g_simd_arch_cache, __ATOMIC_ACQUIRE);
}

/* Release-store via GCC/Clang __atomic builtin. */
static inline void atomic_cache_store(int value) {
    __atomic_store_n(&g_simd_arch_cache, value, __ATOMIC_RELEASE);
}

#elif defined(CFD_HAS_MSVC_ATOMICS)
static volatile long g_simd_arch_cache = -1;

static inline int atomic_cache_load(void) {
    /* _InterlockedCompareExchange provides full barrier on x86/x64
     * This is the intrinsic form (note the underscore prefix).
     * CAS(target, -1, -1) acts as an atomic read: when the cache still
     * holds -1 it rewrites -1 (no visible change); otherwise the compare
     * fails and the value is untouched. Either way the intrinsic returns
     * the current cache contents. */
    return (int)_InterlockedCompareExchange(&g_simd_arch_cache, -1, -1);
}

static inline void atomic_cache_store(int value) {
    /* Full-barrier exchange; the returned previous value is not needed. */
    _InterlockedExchange(&g_simd_arch_cache, (long)value);
}

#else
/* Fallback: volatile with no guarantees beyond single-threaded correctness.
 * This should rarely be hit on modern compilers. */
#warning "No atomic primitives available - cache may not be thread-safe"
static volatile int g_simd_arch_cache = -1;

/* Plain read; no cross-thread ordering guarantees in this fallback. */
static inline int atomic_cache_load(void) {
    return g_simd_arch_cache;
}

/* Plain write; no cross-thread ordering guarantees in this fallback. */
static inline void atomic_cache_store(int value) {
    g_simd_arch_cache = value;
}
#endif
101 :
102 : /* ============================================================================
103 : * Platform-Specific CPU Detection Headers
104 : * ============================================================================ */
105 :
106 : #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
107 : #include <intrin.h>
108 : #define CFD_RUNTIME_X86_MSVC 1
109 : #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
110 : #include <cpuid.h>
111 : #define CFD_RUNTIME_X86_GCC 1
112 :
/**
 * Read an extended control register via XGETBV (GCC/Clang x86 only).
 *
 * Implemented with inline assembly rather than the _xgetbv() intrinsic so
 * this translation unit compiles without -mxsave: runtime-detection code
 * must not itself require the CPU features it is probing for.
 *
 * PRECONDITION: the caller must have confirmed OSXSAVE support (CPUID
 * leaf 1, ECX bit 27) before calling this. OSXSAVE set means both that
 * the CPU implements XGETBV and that the OS enabled XSAVE/XRSTOR for
 * context switching; without it, executing xgetbv raises an undefined
 * opcode (#UD) exception. detect_simd_arch_impl() honors this by only
 * invoking cfd_xgetbv() inside its if(osxsave) branch.
 *
 * Register contract (same on 32- and 64-bit x86):
 *   in:  ECX = XCR index (0 selects XCR0)
 *   out: EDX:EAX = 64-bit register contents
 * xgetbv touches no XMM/YMM state, so no SIMD clobbers are listed. The
 * "memory" clobber is only a compiler reordering fence; the instruction
 * itself does not access memory.
 *
 * @param xcr  extended control register index (0 for XCR0)
 * @return     the 64-bit XCR value
 */
static inline unsigned long long cfd_xgetbv(unsigned int xcr) {
    unsigned int lo, hi;
    __asm__ __volatile__(
        "xgetbv"
        : "=a"(lo), "=d"(hi)
        : "c"(xcr)
        : "memory"
    );
    return ((unsigned long long)hi << 32) | (unsigned long long)lo;
}
147 : #endif
148 :
149 : /* ============================================================================
150 : * Runtime Detection Implementation
151 : * ============================================================================ */
152 :
/**
 * Perform the actual SIMD detection.
 *
 * This is a pure function - always returns the same result for a given CPU
 * and OS configuration (both fixed at boot), so racing callers always
 * compute the same value.
 *
 * @return CFD_SIMD_AVX2 when both the CPU and the OS support AVX2,
 *         CFD_SIMD_NEON on ARM builds with NEON available,
 *         CFD_SIMD_NONE otherwise (including unknown platforms).
 */
static cfd_simd_arch_t detect_simd_arch_impl(void) {
#if defined(CFD_RUNTIME_X86_MSVC)
    /* MSVC on x86/x64: Use __cpuid intrinsic */
    int cpuInfo[4] = {0};
    __cpuid(cpuInfo, 0);          /* leaf 0: EAX = highest supported leaf */
    int nIds = cpuInfo[0];

    if (nIds >= 7) {              /* leaf 7 (holds the AVX2 bit) must exist */
        /* First check CPUID leaf 1 for OSXSAVE support (ECX bit 27) */
        __cpuid(cpuInfo, 1);
        int osxsave = (cpuInfo[2] & (1 << 27)) != 0;

        if (osxsave) {
            /* OSXSAVE is enabled, now check XCR0 for AVX state support */
            /* XCR0 bits: bit 1 = SSE state, bit 2 = AVX state */
            /* Both must be set for AVX to work */
            unsigned long long xcr0 = _xgetbv(0);
            int avx_os_support = ((xcr0 & 0x6) == 0x6);

            if (avx_os_support) {
                /* Now check CPU support for AVX2 (leaf 7, subleaf 0) */
                __cpuidex(cpuInfo, 7, 0);
                /* Check AVX2 bit (EBX bit 5); cpuInfo[1] is EBX */
                if (cpuInfo[1] & (1 << 5)) {
                    return CFD_SIMD_AVX2;
                }
            }
        }
    }
    return CFD_SIMD_NONE;

#elif defined(CFD_RUNTIME_X86_GCC)
    /* GCC/Clang on x86/x64: Use __get_cpuid_count.
     * The cpuid.h wrappers return 0 when the requested leaf is not
     * supported, so no explicit max-leaf check is required here. */
    unsigned int eax, ebx, ecx, edx;

    /* First check CPUID leaf 1 for OSXSAVE support (ECX bit 27) */
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        int osxsave = (ecx & (1 << 27)) != 0;

        if (osxsave) {
            /* OSXSAVE is enabled, now check XCR0 for AVX state support */
            /* XCR0 bits: bit 1 = SSE state, bit 2 = AVX state */
            /* Both must be set for AVX to work */
            /* cfd_xgetbv is safe here: OSXSAVE was verified above */
            unsigned long long xcr0 = cfd_xgetbv(0);
            int avx_os_support = ((xcr0 & 0x6) == 0x6);

            if (avx_os_support) {
                /* Now check CPU support for AVX2 (leaf 7, subleaf 0) */
                if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
                    /* Check AVX2 bit (EBX bit 5) */
                    if (ebx & (1 << 5)) {
                        return CFD_SIMD_AVX2;
                    }
                }
            }
        }
    }
    return CFD_SIMD_NONE;

#elif defined(__aarch64__) || defined(_M_ARM64)
    /* ARM64: NEON is always available on AArch64 */
    return CFD_SIMD_NEON;

#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* ARMv7 with NEON: Assume available if compiled with NEON support */
    return CFD_SIMD_NEON;

#else
    /* Unknown architecture: no SIMD path */
    return CFD_SIMD_NONE;
#endif
}
228 :
229 234518 : cfd_simd_arch_t cfd_detect_simd_arch(void) {
230 : /* Read cached result with acquire semantics */
231 234518 : int cached = atomic_cache_load();
232 :
233 234518 : if (cached >= 0) {
234 : return (cfd_simd_arch_t)cached;
235 : }
236 :
237 : /* Perform detection - result will be the same regardless of which thread
238 : * computes it, so racing here is safe (just wastes some CPU cycles). */
239 30 : cfd_simd_arch_t detected = detect_simd_arch_impl();
240 :
241 : /* Store result with release semantics to ensure visibility to other threads */
242 30 : atomic_cache_store((int)detected);
243 :
244 30 : return detected;
245 : }
246 :
247 : /* ============================================================================
248 : * Convenience Functions
249 : * ============================================================================ */
250 :
251 0 : bool cfd_has_avx2(void) {
252 0 : return cfd_detect_simd_arch() == CFD_SIMD_AVX2;
253 : }
254 :
255 0 : bool cfd_has_neon(void) {
256 0 : return cfd_detect_simd_arch() == CFD_SIMD_NEON;
257 : }
258 :
259 18 : bool cfd_has_simd(void) {
260 18 : cfd_simd_arch_t arch = cfd_detect_simd_arch();
261 18 : return arch == CFD_SIMD_AVX2 || arch == CFD_SIMD_NEON;
262 : }
263 :
264 3 : const char* cfd_get_simd_name(void) {
265 3 : switch (cfd_detect_simd_arch()) {
266 : case CFD_SIMD_AVX2:
267 : return "avx2";
268 0 : case CFD_SIMD_NEON:
269 0 : return "neon";
270 0 : default:
271 0 : return "none";
272 : }
273 : }
|