|
1 | 1 | # |
2 | 2 | # Encapsulates building FFTW as an External Project. |
3 | 3 | # |
4 | | -# NOTE: internal building of fftw is for convenience, |
5 | | -# and the version of fftw built here does not |
6 | | -# use modern hardware optimzations. |
| 4 | +# SIMD codelet selection |
| 5 | +# ---------------------- |
| 6 | +# FFTW SIMD codelets are generated C sources (SIMD intrinsics) baked into the |
| 7 | +# library at compile time. Passing -march=native to the ITK build does |
| 8 | +# NOT activate them; they must be requested explicitly via FFTW's own |
| 9 | +# CMake options (ENABLE_NEON, ENABLE_SSE, ENABLE_SSE2, ENABLE_AVX, ENABLE_AVX2). |
7 | 10 | # |
8 | | -# The build configuration chosen to be |
9 | | -# generalizable to as many hardware platforms. |
10 | | -# Being backward compatible for decades |
11 | | -# old hardware is the goal of this internal |
12 | | -# representation. |
| 11 | +# This file detects appropriate defaults at cmake configure time: |
13 | 12 | # |
14 | | -# This is primarily used to support testing |
15 | | -# and should not be used for production |
16 | | -# builds where performance is a concern. |
| 13 | +# Native builds (CMAKE_CROSSCOMPILING is false): |
| 14 | +# - ARM64 (aarch64/arm64/ARM64): NEON=ON (mandatory in ARMv8); x86 SIMD off. |
| 15 | +# - x86/x86_64 with GCC/Clang: each of SSE, SSE2, AVX, AVX2 is probed |
| 16 | +# individually via __builtin_cpu_supports() / CheckCSourceRuns so that |
| 17 | +# the detected flags match the actual build-host CPU. A pre-AVX CPU (older than |
| 18 | +# Sandy Bridge) gets SSE+SSE2 only; Sandy/Ivy Bridge adds AVX; a Haswell or later gets all four. |
| 19 | +# On MSVC the probes are skipped (intrinsic unavailable) and SIMD |
| 20 | +# defaults to off; users can override via FFTW_ENABLE_* options. |
| 21 | +# - Other architectures: all SIMD off (conservative fallback). |
| 22 | +# |
| 23 | +# Cross-compiled builds (CMAKE_CROSSCOMPILING is true): |
| 24 | +# - ARM64: NEON=ON (mandatory); x86 SIMD off. |
| 25 | +# - x86_64: SSE+SSE2 only (baseline; AVX/AVX2 not assumed for target). |
| 26 | +# - Other: all SIMD off. |
| 27 | +# |
| 28 | +# Every flag is an individually overridable cache option, e.g.: |
| 29 | +# cmake -DFFTW_ENABLE_AVX2=OFF ... |
| 30 | +# Note: option() defaults are only applied on the first configure. |
| 31 | +# To re-detect after a toolchain change, delete the CMake cache or use |
| 32 | +# cmake --fresh, or pass explicit -DFFTW_ENABLE_*= overrides. |
| 33 | +# |
| 34 | +# ENABLE_SSE (SSE1) is float-only and is not forwarded to the |
| 35 | +# double-precision fftwd build. |
17 | 36 | # |
18 | 37 | # These instructions follow the guidance provided for modern cmake usage as described: |
19 | 38 | # https://github.com/dev-cafe/cmake-cookbook/blob/master/chapter-08/recipe-03/c-example/external/upstream/fftw3/CMakeLists.txt |
@@ -64,6 +83,79 @@ if(NOT ITK_USE_SYSTEM_FFTW) |
64 | 83 |
|
65 | 84 | set(FFTW_STAGED_INSTALL_PREFIX "${ITK_BINARY_DIR}/fftw") |
66 | 85 |
|
| 86 | + # Detect SIMD defaults (see file header for full policy description). |
| 87 | + # check_c_source_runs() results are cached (_fftw_cpu_has_<level>) after the first configure. |
| 88 | + include(CheckCSourceRuns) |
| 89 | + |
| 90 | + set(_fftw_default_neon OFF) |
| 91 | + set(_fftw_default_sse OFF) |
| 92 | + set(_fftw_default_sse2 OFF) |
| 93 | + set(_fftw_default_avx OFF) |
| 94 | + set(_fftw_default_avx2 OFF) |
| 95 | + |
| 96 | + if(NOT CMAKE_CROSSCOMPILING) |
| 97 | +   if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") |
| 98 | +     # NEON is mandatory in ARMv8/AArch64 — every arm64 CPU has it. |
| 99 | +     set(_fftw_default_neon ON) |
| 100 | +   elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i[3-6]86") |
| 101 | +     # Matches BSD-style lowercase "amd64" and 32-bit i386..i686 as well. |
| 102 | +     # Probe each SIMD level separately so the defaults fit the build host: |
| 103 | +     # pre-AVX Westmere gets SSE+SSE2; Sandy/Ivy Bridge adds AVX, not AVX2. |
| 104 | +     # __builtin_cpu_supports is a GCC/Clang intrinsic; skip on MSVC. |
| 105 | +     if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang|AppleClang") |
| 106 | +       foreach(_fftw_simd IN ITEMS sse sse2 avx avx2) |
| 107 | +         check_c_source_runs( |
| 108 | +           "int main(void){return __builtin_cpu_supports(\"${_fftw_simd}\")?0:1;}" |
| 109 | +           _fftw_cpu_has_${_fftw_simd} |
| 110 | +         ) |
| 111 | +         if(_fftw_cpu_has_${_fftw_simd}) |
| 112 | +           set(_fftw_default_${_fftw_simd} ON) |
| 113 | +         endif() |
| 114 | +       endforeach() |
| 115 | +     endif() |
| 116 | +   endif() |
| 117 | + else() |
| 118 | +   # Cross-compiling: the host cannot run target code, so fall back to architecture-level defaults. |
| 119 | +   if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64") |
| 120 | +     set(_fftw_default_neon ON) |
| 121 | +   elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64") |
| 122 | +     # SSE/SSE2 are baseline on all 64-bit x86 CPUs; AVX/AVX2 not assumed. |
| 123 | +     set(_fftw_default_sse ON) |
| 124 | +     set(_fftw_default_sse2 ON) |
| 125 | +   endif() |
| 126 | + endif() |
| 127 | + |
| | + # User-facing cache options, one per SIMD level. Each _fftw_default_* value |
| | + # computed above only seeds a cache entry that does not exist yet; a value |
| | + # already in the cache or passed with -DFFTW_ENABLE_<level>= is left unchanged. |
| 128 | + option( |
| 129 | + FFTW_ENABLE_NEON |
| 130 | + "Enable FFTW NEON SIMD codelets (ARM64)" |
| 131 | + ${_fftw_default_neon} |
| 132 | + ) |
| 133 | + option( |
| 134 | + FFTW_ENABLE_SSE |
| 135 | + "Enable FFTW SSE SIMD codelets (x86)" |
| 136 | + ${_fftw_default_sse} |
| 137 | + ) |
| 138 | + option( |
| 139 | + FFTW_ENABLE_SSE2 |
| 140 | + "Enable FFTW SSE2 SIMD codelets (x86)" |
| 141 | + ${_fftw_default_sse2} |
| 142 | + ) |
| 143 | + option( |
| 144 | + FFTW_ENABLE_AVX |
| 145 | + "Enable FFTW AVX SIMD codelets (x86)" |
| 146 | + ${_fftw_default_avx} |
| 147 | + ) |
| 148 | + option( |
| 149 | + FFTW_ENABLE_AVX2 |
| 150 | + "Enable FFTW AVX2 SIMD codelets (x86)" |
| 151 | + ${_fftw_default_avx2} |
| 152 | + ) |
| 153 | + |
| | + # Print the effective option values (after any user override) to the configure log. |
| 154 | + message( |
| 155 | + STATUS |
| 156 | + "FFTW SIMD: NEON=${FFTW_ENABLE_NEON} SSE=${FFTW_ENABLE_SSE} SSE2=${FFTW_ENABLE_SSE2} AVX=${FFTW_ENABLE_AVX} AVX2=${FFTW_ENABLE_AVX2}" |
| 157 | + ) |
| 158 | + |
67 | 159 | # Macro to generate library filename with appropriate prefix/suffix |
68 | 160 | # Args: output_var library_base_name |
69 | 161 | macro(_library_name_to_filename output_var library_base_name) |
@@ -111,10 +203,12 @@ if(NOT ITK_USE_SYSTEM_FFTW) |
111 | 203 | -DCMAKE_INSTALL_PREFIX:PATH=${FFTW_STAGED_INSTALL_PREFIX} |
112 | 204 | -DCMAKE_INSTALL_LIBDIR:STRING=${CMAKE_INSTALL_LIBDIR} |
113 | 205 | -DCMAKE_INSTALL_BINDIR:STRING=${CMAKE_INSTALL_BINDIR} |
114 | | - -DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=OFF -DENABLE_AVX2:BOOL=OFF |
115 | | - -DENABLE_FLOAT:BOOL=ON -DENABLE_LONG_DOUBLE:BOOL=OFF |
| 206 | + -DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=${FFTW_ENABLE_AVX} |
| 207 | + -DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2} -DENABLE_FLOAT:BOOL=ON |
| 208 | + -DENABLE_LONG_DOUBLE:BOOL=OFF -DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON} |
116 | 209 | -DENABLE_OPENMP:BOOL=OFF -DENABLE_QUAD_PRECISION:BOOL=OFF |
117 | | - -DENABLE_SSE:BOOL=OFF -DENABLE_SSE2:BOOL=OFF -DENABLE_THREADS:BOOL=ON |
| 210 | + -DENABLE_SSE:BOOL=${FFTW_ENABLE_SSE} |
| 211 | + -DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2} -DENABLE_THREADS:BOOL=ON |
118 | 212 | -DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR} |
119 | 213 | -DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER} |
120 | 214 | -DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER} |
@@ -175,10 +269,12 @@ if(NOT ITK_USE_SYSTEM_FFTW) |
175 | 269 | -DCMAKE_INSTALL_PREFIX:PATH=${FFTW_STAGED_INSTALL_PREFIX} |
176 | 270 | -DCMAKE_INSTALL_LIBDIR:STRING=${CMAKE_INSTALL_LIBDIR} |
177 | 271 | -DCMAKE_INSTALL_BINDIR:STRING=${CMAKE_INSTALL_BINDIR} |
178 | | - -DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=OFF -DENABLE_AVX2:BOOL=OFF |
179 | | - -DENABLE_FLOAT:BOOL=OFF -DENABLE_LONG_DOUBLE:BOOL=OFF |
| 272 | + -DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=${FFTW_ENABLE_AVX} |
| 273 | + -DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2} -DENABLE_FLOAT:BOOL=OFF |
| 274 | + -DENABLE_LONG_DOUBLE:BOOL=OFF -DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON} |
180 | 275 | -DENABLE_OPENMP:BOOL=OFF -DENABLE_QUAD_PRECISION:BOOL=OFF |
181 | | - -DENABLE_SSE:BOOL=OFF -DENABLE_SSE2:BOOL=OFF -DENABLE_THREADS:BOOL=ON |
| 276 | + -DENABLE_SSE:BOOL=OFF # SSE1 codelets are 32-bit float only; no effect on double-precision |
| 277 | + -DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2} -DENABLE_THREADS:BOOL=ON |
182 | 278 | -DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR} |
183 | 279 | -DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER} |
184 | 280 | -DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER} |
|
0 commit comments