LCOV - code coverage report
Current view: top level - core - cpu_features.c (source / functions) Coverage Total Hit
Test: coverage.info Lines: 78.4 % 37 29
Test Date: 2026-03-04 10:22:18 Functions: 66.7 % 6 4

            Line data    Source code
       1              : /**
       2              :  * CPU Feature Detection Implementation
       3              :  *
       4              :  * Runtime detection of CPU SIMD capabilities using platform-specific methods:
       5              :  * - x86/x64 (MSVC): __cpuid/__cpuidex intrinsics + XGETBV for OS support
       6              :  * - x86/x64 (GCC/Clang): __get_cpuid_count + xgetbv for OS support
       7              :  * - ARM64: NEON is always available
       8              :  *
       9              :  * IMPORTANT: For AVX2, we must verify both:
      10              :  * 1. CPU support (CPUID leaf 7, EBX bit 5)
      11              :  * 2. OS support (OSXSAVE enabled + XCR0 bits 1,2 set for AVX state saving)
      12              :  *
      13              :  * Without OS support verification, AVX instructions will cause illegal
      14              :  * instruction exceptions on systems where the OS hasn't enabled AVX.
      15              :  *
      16              :  * Thread Safety:
      17              :  * The cache uses atomic operations with proper memory ordering to ensure
      18              :  * thread-safe initialization. The detection logic is idempotent (always
      19              :  * produces the same result), so multiple threads racing to initialize is
      20              :  * safe - they will all compute and store the same value.
      21              :  *
      22              :  * We use:
      23              :  * - C11 stdatomic.h when available (GCC 4.9+, Clang 3.1+, MSVC 2022+)
      24              :  * - GCC __atomic builtins as fallback (GCC 4.7+)
      25              :  * - MSVC Interlocked functions as final fallback
      26              :  */
      27              : 
      28              : #include "cfd/core/cpu_features.h"
      29              : 
      30              : /* ============================================================================
      31              :  * Atomic Operations Abstraction
      32              :  *
      33              :  * Provides atomic load/store with proper memory ordering across platforms.
      34              :  * Priority: C11 atomics > GCC atomics > MSVC Interlocked > volatile fallback
      35              :  * ============================================================================ */
      36              : 
/* Check for C11 atomics support */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
    /* C11 atomics available */
    #include <stdatomic.h>
    #define CFD_HAS_C11_ATOMICS 1
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
    /* GCC 4.7+ atomic builtins */
    #define CFD_HAS_GCC_ATOMICS 1
#elif defined(_MSC_VER)
    /* MSVC Interlocked functions */
    #include <intrin.h>
    #define CFD_HAS_MSVC_ATOMICS 1
#endif

/* Atomic cache type and operations.
 *
 * Cache protocol: -1 is the "not yet detected" sentinel; any value >= 0 is a
 * previously detected cfd_simd_arch_t. Loads use acquire ordering and stores
 * use release ordering so that a reader observing a value >= 0 also observes
 * everything the detecting thread did before publishing it. */
#if defined(CFD_HAS_C11_ATOMICS)
    static _Atomic int g_simd_arch_cache = -1;

    static inline int atomic_cache_load(void) {
        return atomic_load_explicit(&g_simd_arch_cache, memory_order_acquire);
    }

    static inline void atomic_cache_store(int value) {
        atomic_store_explicit(&g_simd_arch_cache, value, memory_order_release);
    }

#elif defined(CFD_HAS_GCC_ATOMICS)
    static int g_simd_arch_cache = -1;

    static inline int atomic_cache_load(void) {
        return __atomic_load_n(&g_simd_arch_cache, __ATOMIC_ACQUIRE);
    }

    static inline void atomic_cache_store(int value) {
        __atomic_store_n(&g_simd_arch_cache, value, __ATOMIC_RELEASE);
    }

#elif defined(CFD_HAS_MSVC_ATOMICS)
    static volatile long g_simd_arch_cache = -1;

    static inline int atomic_cache_load(void) {
        /* _InterlockedCompareExchange provides full barrier on x86/x64
         * This is the intrinsic form (note the underscore prefix) */
        /* CAS(-1, -1) acts as a pure atomic read: when the cache still holds
         * the sentinel it is overwritten with the same sentinel (a no-op);
         * in every case the previous value is returned. */
        return (int)_InterlockedCompareExchange(&g_simd_arch_cache, -1, -1);
    }

    static inline void atomic_cache_store(int value) {
        _InterlockedExchange(&g_simd_arch_cache, (long)value);
    }

#else
    /* Fallback: volatile with no guarantees beyond single-threaded correctness.
     * This should rarely be hit on modern compilers. */
    #warning "No atomic primitives available - cache may not be thread-safe"
    static volatile int g_simd_arch_cache = -1;

    static inline int atomic_cache_load(void) {
        return g_simd_arch_cache;
    }

    static inline void atomic_cache_store(int value) {
        g_simd_arch_cache = value;
    }
#endif
     101              : 
     102              : /* ============================================================================
     103              :  * Platform-Specific CPU Detection Headers
     104              :  * ============================================================================ */
     105              : 
     106              : #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
     107              : #include <intrin.h>
     108              : #define CFD_RUNTIME_X86_MSVC 1
     109              : #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
     110              : #include <cpuid.h>
     111              : #define CFD_RUNTIME_X86_GCC 1
     112              : 
     113              : /**
     114              :  * Inline xgetbv for GCC/Clang.
     115              :  *
     116              :  * We use inline assembly instead of _xgetbv() intrinsic because the intrinsic
     117              :  * requires -mxsave compiler flag, but we want this file to compile without
     118              :  * requiring any special CPU feature flags (since it's doing runtime detection).
     119              :  *
     120              :  * IMPORTANT: This function must ONLY be called after verifying OSXSAVE support
     121              :  * via CPUID leaf 1, ECX bit 27. The OSXSAVE bit indicates that:
     122              :  * 1. The CPU supports the XGETBV instruction
     123              :  * 2. The OS has enabled XSAVE/XRSTOR for context switching
     124              :  *
     125              :  * Calling xgetbv without OSXSAVE support will cause an undefined opcode (#UD)
     126              :  * exception. The detect_simd_arch_impl() function enforces this requirement
     127              :  * by only calling cfd_xgetbv() inside an if(osxsave) block.
     128              :  *
     129              :  * Register usage (identical on 32-bit and 64-bit x86):
     130              :  * - Input: ECX = XCR number (0 for XCR0)
     131              :  * - Output: EDX:EAX = XCR value (64-bit)
     132              :  * - xgetbv does NOT modify XMM/YMM registers - no SIMD clobbers needed
     133              :  *
     134              :  * The "memory" clobber ensures the compiler doesn't reorder this with
     135              :  * memory operations, though xgetbv itself doesn't access memory.
     136              :  */
static inline unsigned long long cfd_xgetbv(unsigned int xcr) {
    unsigned int eax, edx;
    /* ECX selects which extended control register to read (0 = XCR0);
     * the 64-bit result is returned split across EDX:EAX. */
    __asm__ __volatile__(
        "xgetbv"
        : "=a"(eax), "=d"(edx)
        : "c"(xcr)
        : "memory"
    );
    /* Recombine the two 32-bit halves into the full 64-bit XCR value. */
    return ((unsigned long long)edx << 32) | eax;
}
     147              : #endif
     148              : 
     149              : /* ============================================================================
     150              :  * Runtime Detection Implementation
     151              :  * ============================================================================ */
     152              : 
     153              : /**
     154              :  * Perform the actual SIMD detection.
     155              :  * This is a pure function - always returns the same result for a given CPU.
     156              :  */
     157           30 : static cfd_simd_arch_t detect_simd_arch_impl(void) {
     158              : #if defined(CFD_RUNTIME_X86_MSVC)
     159              :     /* MSVC on x86/x64: Use __cpuid intrinsic */
     160              :     int cpuInfo[4] = {0};
     161              :     __cpuid(cpuInfo, 0);
     162              :     int nIds = cpuInfo[0];
     163              : 
     164              :     if (nIds >= 7) {
     165              :         /* First check CPUID leaf 1 for OSXSAVE support (ECX bit 27) */
     166              :         __cpuid(cpuInfo, 1);
     167              :         int osxsave = (cpuInfo[2] & (1 << 27)) != 0;
     168              : 
     169              :         if (osxsave) {
     170              :             /* OSXSAVE is enabled, now check XCR0 for AVX state support */
     171              :             /* XCR0 bits: bit 1 = SSE state, bit 2 = AVX state */
     172              :             /* Both must be set for AVX to work */
     173              :             unsigned long long xcr0 = _xgetbv(0);
     174              :             int avx_os_support = ((xcr0 & 0x6) == 0x6);
     175              : 
     176              :             if (avx_os_support) {
     177              :                 /* Now check CPU support for AVX2 */
     178              :                 __cpuidex(cpuInfo, 7, 0);
     179              :                 /* Check AVX2 bit (EBX bit 5) */
     180              :                 if (cpuInfo[1] & (1 << 5)) {
     181              :                     return CFD_SIMD_AVX2;
     182              :                 }
     183              :             }
     184              :         }
     185              :     }
     186              :     return CFD_SIMD_NONE;
     187              : 
     188              : #elif defined(CFD_RUNTIME_X86_GCC)
     189              :     /* GCC/Clang on x86/x64: Use __get_cpuid_count */
     190           30 :     unsigned int eax, ebx, ecx, edx;
     191              : 
     192              :     /* First check CPUID leaf 1 for OSXSAVE support (ECX bit 27) */
     193           60 :     if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
     194           30 :         int osxsave = (ecx & (1 << 27)) != 0;
     195              : 
     196           30 :         if (osxsave) {
     197              :             /* OSXSAVE is enabled, now check XCR0 for AVX state support */
     198              :             /* XCR0 bits: bit 1 = SSE state, bit 2 = AVX state */
     199              :             /* Both must be set for AVX to work */
     200           30 :             unsigned long long xcr0 = cfd_xgetbv(0);
     201           30 :             int avx_os_support = ((xcr0 & 0x6) == 0x6);
     202              : 
     203           30 :             if (avx_os_support) {
     204              :                 /* Now check CPU support for AVX2 */
     205           30 :                 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
     206              :                     /* Check AVX2 bit (EBX bit 5) */
     207           30 :                     if (ebx & (1 << 5)) {
     208              :                         return CFD_SIMD_AVX2;
     209              :                     }
     210              :                 }
     211              :             }
     212              :         }
     213              :     }
     214              :     return CFD_SIMD_NONE;
     215              : 
     216              : #elif defined(__aarch64__) || defined(_M_ARM64)
     217              :     /* ARM64: NEON is always available on AArch64 */
     218              :     return CFD_SIMD_NEON;
     219              : 
     220              : #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
     221              :     /* ARMv7 with NEON: Assume available if compiled with NEON support */
     222              :     return CFD_SIMD_NEON;
     223              : 
     224              : #else
     225              :     return CFD_SIMD_NONE;
     226              : #endif
     227              : }
     228              : 
     229       234518 : cfd_simd_arch_t cfd_detect_simd_arch(void) {
     230              :     /* Read cached result with acquire semantics */
     231       234518 :     int cached = atomic_cache_load();
     232              : 
     233       234518 :     if (cached >= 0) {
     234              :         return (cfd_simd_arch_t)cached;
     235              :     }
     236              : 
     237              :     /* Perform detection - result will be the same regardless of which thread
     238              :      * computes it, so racing here is safe (just wastes some CPU cycles). */
     239           30 :     cfd_simd_arch_t detected = detect_simd_arch_impl();
     240              : 
     241              :     /* Store result with release semantics to ensure visibility to other threads */
     242           30 :     atomic_cache_store((int)detected);
     243              : 
     244           30 :     return detected;
     245              : }
     246              : 
     247              : /* ============================================================================
     248              :  * Convenience Functions
     249              :  * ============================================================================ */
     250              : 
     251            0 : bool cfd_has_avx2(void) {
     252            0 :     return cfd_detect_simd_arch() == CFD_SIMD_AVX2;
     253              : }
     254              : 
     255            0 : bool cfd_has_neon(void) {
     256            0 :     return cfd_detect_simd_arch() == CFD_SIMD_NEON;
     257              : }
     258              : 
     259           18 : bool cfd_has_simd(void) {
     260           18 :     cfd_simd_arch_t arch = cfd_detect_simd_arch();
     261           18 :     return arch == CFD_SIMD_AVX2 || arch == CFD_SIMD_NEON;
     262              : }
     263              : 
     264            3 : const char* cfd_get_simd_name(void) {
     265            3 :     switch (cfd_detect_simd_arch()) {
     266              :         case CFD_SIMD_AVX2:
     267              :             return "avx2";
     268            0 :         case CFD_SIMD_NEON:
     269            0 :             return "neon";
     270            0 :         default:
     271            0 :             return "none";
     272              :     }
     273              : }
        

Generated by: LCOV version 2.0-1