Skip to content

Commit 360c9be

Browse files
authored
Merge pull request #6006 from hjmjohnson/fftw-simd-windows-arm64-fix
PERF: Enable FFTW SIMD codelets with per-CPU introspection at configure time
2 parents 76c1bf3 + a9b11c8 commit 360c9be

File tree

1 file changed

+113
-17
lines changed

1 file changed

+113
-17
lines changed

CMake/itkExternal_FFTW.cmake

Lines changed: 113 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,38 @@
11
#
22
# Encapsulates building FFTW as an External Project.
33
#
4-
# NOTE: internal building of fftw is for convenience,
5-
# and the version of fftw built here does not
6-
# use modern hardware optimzations.
4+
# SIMD codelet selection
5+
# ----------------------
6+
# FFTW SIMD codelets are machine-generated, SIMD-specific kernels baked into the
7+
# library at compile time. Passing -march=native to the ITK build does
8+
# NOT activate them; they must be requested explicitly via FFTW's own
9+
# CMake options (ENABLE_NEON, ENABLE_SSE, ENABLE_SSE2, ENABLE_AVX, ENABLE_AVX2).
710
#
8-
# The build configuration chosen to be
9-
# generalizable to as many hardware platforms.
10-
# Being backward compatible for decades
11-
# old hardware is the goal of this internal
12-
# representation.
11+
# This file detects appropriate defaults at cmake configure time:
1312
#
14-
# This is primarily used to support testing
15-
# and should not be used for production
16-
# builds where performance is a concern.
13+
# Native builds (CMAKE_CROSSCOMPILING is false):
14+
# - ARM64 (aarch64/arm64/ARM64): NEON=ON (mandatory in ARMv8); x86 SIMD off.
15+
# - x86/x86_64 with GCC/Clang: each of SSE, SSE2, AVX, AVX2 is probed
16+
# individually via __builtin_cpu_supports() / CheckCSourceRuns so that
17+
# the detected flags match the actual build-host CPU. A pre-AVX
18+
# CPU (older than Sandy Bridge) gets SSE+SSE2 only; Haswell or later gets all four.
19+
# On MSVC the probes are skipped (intrinsic unavailable) and SIMD
20+
# defaults to off; users can override via FFTW_ENABLE_* options.
21+
# - Other architectures: all SIMD off (conservative fallback).
22+
#
23+
# Cross-compiled builds (CMAKE_CROSSCOMPILING is true):
24+
# - ARM64: NEON=ON (mandatory); x86 SIMD off.
25+
# - x86_64: SSE+SSE2 only (baseline; AVX/AVX2 not assumed for target).
26+
# - Other: all SIMD off.
27+
#
28+
# Every flag is an individually overridable cache option, e.g.:
29+
# cmake -DFFTW_ENABLE_AVX2=OFF ...
30+
# Note: option() defaults are only applied on the first configure.
31+
# To re-detect after a toolchain change, delete the CMake cache or use
32+
# cmake --fresh, or pass explicit -DFFTW_ENABLE_*= overrides.
33+
#
34+
# ENABLE_SSE (SSE1) is float-only and is not forwarded to the
35+
# double-precision fftwd build.
1736
#
1837
# These instructions follow the guidance provided for modern cmake usage as described:
1938
# https://github.com/dev-cafe/cmake-cookbook/blob/master/chapter-08/recipe-03/c-example/external/upstream/fftw3/CMakeLists.txt
@@ -64,6 +83,79 @@ if(NOT ITK_USE_SYSTEM_FFTW)
6483

6584
set(FFTW_STAGED_INSTALL_PREFIX "${ITK_BINARY_DIR}/fftw")
6685

86+
# Detect SIMD defaults for the bundled FFTW build and expose them as
# user-overridable cache options.
#
# Policy implemented below:
#   - ARM64 target (native or cross build): NEON defaults to ON.
#   - x86 target, native build, GCC/Clang-family compiler: SSE, SSE2, AVX and
#     AVX2 are each probed on the build host with __builtin_cpu_supports()
#     and default to ON only when the probe program exits with status 0.
#   - x86_64 target, cross build: SSE and SSE2 default to ON; AVX and AVX2
#     stay OFF because the target CPU cannot be probed from the build host.
#   - Anything else: every default stays OFF.
#
# check_c_source_runs() stores each probe result in the CMake cache, and
# option() only applies its default when the option is not already set, so
# detection effectively happens on the first configure of a build tree.
include(CheckCSourceRuns)

set(_fftw_default_neon OFF)
set(_fftw_default_sse OFF)
set(_fftw_default_sse2 OFF)
set(_fftw_default_avx OFF)
set(_fftw_default_avx2 OFF)

# Classify the target processor once so the native and cross-compile paths
# agree on what counts as ARM64 / x86. CMAKE_SYSTEM_PROCESSOR spelling is
# platform dependent (e.g. x86_64 on Linux, AMD64 on Windows, amd64 on
# FreeBSD; x86 or i386..i686 for 32-bit x86), so all common spellings are
# accepted. An empty or unknown value leaves every flag below OFF.
set(_fftw_target_is_arm64 OFF)
set(_fftw_target_is_x86_64 OFF)
set(_fftw_target_is_x86_32 OFF)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
  set(_fftw_target_is_arm64 ON)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64")
  set(_fftw_target_is_x86_64 ON)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|x86)$")
  set(_fftw_target_is_x86_32 ON)
endif()

if(_fftw_target_is_arm64)
  # NEON is mandatory in ARMv8/AArch64 — every arm64 CPU has it, so no probe
  # is needed for either native or cross builds.
  set(_fftw_default_neon ON)
elseif(CMAKE_CROSSCOMPILING)
  # Cross-compiling: conservative architecture-level fallback.
  # SSE/SSE2 are baseline on all 64-bit x86 CPUs; AVX/AVX2 are not assumed.
  # 32-bit x86 targets get no SIMD default because SSE is not guaranteed there.
  if(_fftw_target_is_x86_64)
    set(_fftw_default_sse ON)
    set(_fftw_default_sse2 ON)
  endif()
elseif(_fftw_target_is_x86_64 OR _fftw_target_is_x86_32)
  # Native x86 build: probe each SIMD level individually so the defaults
  # match the actual build-host CPU (a CPU older than Sandy Bridge lacks AVX;
  # Sandy Bridge and Ivy Bridge have AVX but lack AVX2).
  # __builtin_cpu_supports is a GCC/Clang intrinsic; with any other compiler
  # (e.g. MSVC) the probes are skipped and the defaults stay OFF.
  if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang|AppleClang")
    foreach(_fftw_simd IN ITEMS sse sse2 avx avx2)
      # The probe exits 0 when the running CPU reports the feature.
      check_c_source_runs(
        "int main(void){return __builtin_cpu_supports(\"${_fftw_simd}\")?0:1;}"
        _fftw_cpu_has_${_fftw_simd}
      )
      if(_fftw_cpu_has_${_fftw_simd})
        set(_fftw_default_${_fftw_simd} ON)
      endif()
    endforeach()
  endif()
endif()

# User-facing switches; each can be overridden on the command line, e.g.
#   cmake -DFFTW_ENABLE_AVX2=OFF ...
option(
  FFTW_ENABLE_NEON
  "Enable FFTW NEON SIMD codelets (ARM64)"
  ${_fftw_default_neon}
)
option(
  FFTW_ENABLE_SSE
  "Enable FFTW SSE SIMD codelets (x86)"
  ${_fftw_default_sse}
)
option(
  FFTW_ENABLE_SSE2
  "Enable FFTW SSE2 SIMD codelets (x86)"
  ${_fftw_default_sse2}
)
option(
  FFTW_ENABLE_AVX
  "Enable FFTW AVX SIMD codelets (x86)"
  ${_fftw_default_avx}
)
option(
  FFTW_ENABLE_AVX2
  "Enable FFTW AVX2 SIMD codelets (x86)"
  ${_fftw_default_avx2}
)

# Report the effective (possibly user-overridden) selection.
message(
  STATUS
  "FFTW SIMD: NEON=${FFTW_ENABLE_NEON} SSE=${FFTW_ENABLE_SSE} SSE2=${FFTW_ENABLE_SSE2} AVX=${FFTW_ENABLE_AVX} AVX2=${FFTW_ENABLE_AVX2}"
)
158+
67159
# Macro to generate library filename with appropriate prefix/suffix
68160
# Args: output_var library_base_name
69161
macro(_library_name_to_filename output_var library_base_name)
@@ -111,10 +203,12 @@ if(NOT ITK_USE_SYSTEM_FFTW)
111203
-DCMAKE_INSTALL_PREFIX:PATH=${FFTW_STAGED_INSTALL_PREFIX}
112204
-DCMAKE_INSTALL_LIBDIR:STRING=${CMAKE_INSTALL_LIBDIR}
113205
-DCMAKE_INSTALL_BINDIR:STRING=${CMAKE_INSTALL_BINDIR}
114-
-DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=OFF -DENABLE_AVX2:BOOL=OFF
115-
-DENABLE_FLOAT:BOOL=ON -DENABLE_LONG_DOUBLE:BOOL=OFF
206+
-DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=${FFTW_ENABLE_AVX}
207+
-DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2} -DENABLE_FLOAT:BOOL=ON
208+
-DENABLE_LONG_DOUBLE:BOOL=OFF -DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON}
116209
-DENABLE_OPENMP:BOOL=OFF -DENABLE_QUAD_PRECISION:BOOL=OFF
117-
-DENABLE_SSE:BOOL=OFF -DENABLE_SSE2:BOOL=OFF -DENABLE_THREADS:BOOL=ON
210+
-DENABLE_SSE:BOOL=${FFTW_ENABLE_SSE}
211+
-DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2} -DENABLE_THREADS:BOOL=ON
118212
-DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR}
119213
-DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER}
120214
-DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER}
@@ -175,10 +269,12 @@ if(NOT ITK_USE_SYSTEM_FFTW)
175269
-DCMAKE_INSTALL_PREFIX:PATH=${FFTW_STAGED_INSTALL_PREFIX}
176270
-DCMAKE_INSTALL_LIBDIR:STRING=${CMAKE_INSTALL_LIBDIR}
177271
-DCMAKE_INSTALL_BINDIR:STRING=${CMAKE_INSTALL_BINDIR}
178-
-DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=OFF -DENABLE_AVX2:BOOL=OFF
179-
-DENABLE_FLOAT:BOOL=OFF -DENABLE_LONG_DOUBLE:BOOL=OFF
272+
-DDISABLE_FORTRAN:BOOL=ON -DENABLE_AVX:BOOL=${FFTW_ENABLE_AVX}
273+
-DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2} -DENABLE_FLOAT:BOOL=OFF
274+
-DENABLE_LONG_DOUBLE:BOOL=OFF -DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON}
180275
-DENABLE_OPENMP:BOOL=OFF -DENABLE_QUAD_PRECISION:BOOL=OFF
181-
-DENABLE_SSE:BOOL=OFF -DENABLE_SSE2:BOOL=OFF -DENABLE_THREADS:BOOL=ON
276+
-DENABLE_SSE:BOOL=OFF # SSE1 codelets are 32-bit float only; no effect on double-precision
277+
-DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2} -DENABLE_THREADS:BOOL=ON
182278
-DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR}
183279
-DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER}
184280
-DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER}

0 commit comments

Comments
 (0)