Skip to content

Commit dc9b77c

Browse files
malfet authored and pytorchmergebot committed
[MPS] Support includes in metal objects (pytorch#145087)
Useful for code reuse for Metal shader build both for eager mode and MPSInductor, but it requires one to implement `_cpp_embed_headers` tool that, as name suggests, would preprocess and embeds the for shader to be used in dynamic compilation. Test using: - `TestMetalLibrary.test_metal_include` - Moving `i0`/`i1` implementation to `c10/util/metal_special_math.h` and call it from `SpecialOps.metal` shader, which now looks much more compact: ```metal template <typename T, typename Tout = T> void kernel i0(constant T* input, device Tout* output, uint index [[thread_position_in_grid]]) { output[index] = c10::i0(static_cast<Tout>(input[index])); } ``` Pull Request resolved: pytorch#145087 Approved by: https://github.com/dcci ghstack dependencies: pytorch#145023
1 parent 2859b11 commit dc9b77c

File tree

8 files changed

+219
-135
lines changed

8 files changed

+219
-135
lines changed

.lintrunner.toml

+1
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ exclude_patterns = [
249249
'c10/util/complex_utils.h',
250250
'c10/util/flat_hash_map.h',
251251
'c10/util/logging*.h',
252+
'c10/metal/*.h',
252253
'c10/util/hash.h',
253254
'c10/util/strong_type.h',
254255
'c10/util/SmallVector.h',

aten/src/ATen/native/mps/kernels/SpecialOps.metal

+3-133
Original file line numberDiff line numberDiff line change
@@ -1,149 +1,19 @@
1-
#include <metal_stdlib>
2-
using namespace metal;
3-
4-
/*
5-
* For licensing information and documentation, please refer to the cpu
6-
* implementation located in "ATen/native/Math.h".
7-
*/
8-
9-
template <typename T>
10-
T chbevl(T x, const float array[], const int len) {
11-
T b0, b1, b2;
12-
13-
b0 = array[0];
14-
b1 = 0;
15-
16-
for (int i = 1; i < len; ++i) {
17-
b2 = b1;
18-
b1 = b0;
19-
b0 = x * b1 - b2 + array[i];
20-
}
21-
22-
return T{0.5} * (b0 - b2);
23-
}
24-
25-
// Copied from
26-
// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L502
27-
28-
template <typename T>
29-
T i0(T _x) {
30-
auto x = fabs(_x);
31-
32-
if (x <= 8.0) {
33-
/* Chebyshev coefficients for exp(-x) I0(x)
34-
* in the interval [0,8].
35-
*
36-
* lim(x->0){ exp(-x) I0(x) } = 1.
37-
*/
38-
const float A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
39-
-2.43127984654795469359E-16, 1.71539128555513303061E-15,
40-
-1.16853328779934516808E-14, 7.67618549860493561688E-14,
41-
-4.85644678311192946090E-13, 2.95505266312963983461E-12,
42-
-1.72682629144155570723E-11, 9.67580903537323691224E-11,
43-
-5.18979560163526290666E-10, 2.65982372468238665035E-9,
44-
-1.30002500998624804212E-8, 6.04699502254191894932E-8,
45-
-2.67079385394061173391E-7, 1.11738753912010371815E-6,
46-
-4.41673835845875056359E-6, 1.64484480707288970893E-5,
47-
-5.75419501008210370398E-5, 1.88502885095841655729E-4,
48-
-5.76375574538582365885E-4, 1.63947561694133579842E-3,
49-
-4.32430999505057594430E-3, 1.05464603945949983183E-2,
50-
-2.37374148058994688156E-2, 4.93052842396707084878E-2,
51-
-9.49010970480476444210E-2, 1.71620901522208775349E-1,
52-
-3.04682672343198398683E-1, 6.76795274409476084995E-1};
53-
54-
auto y = (x / 2.0) - 2.0;
55-
return static_cast<T>(exp(x) * chbevl(y, A, 30));
56-
}
57-
58-
// Handles x > 8 case
59-
/* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
60-
* in the inverted interval [8,infinity].
61-
*
62-
* lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
63-
*/
64-
const float B[] = {-7.23318048787475395456E-18, -4.83050448594418207126E-18,
65-
4.46562142029675999901E-17, 3.46122286769746109310E-17,
66-
-2.82762398051658348494E-16, -3.42548561967721913462E-16,
67-
1.77256013305652638360E-15, 3.81168066935262242075E-15,
68-
-9.55484669882830764870E-15, -4.15056934728722208663E-14,
69-
1.54008621752140982691E-14, 3.85277838274214270114E-13,
70-
7.18012445138366623367E-13, -1.79417853150680611778E-12,
71-
-1.32158118404477131188E-11, -3.14991652796324136454E-11,
72-
1.18891471078464383424E-11, 4.94060238822496958910E-10,
73-
3.39623202570838634515E-9, 2.26666899049817806459E-8,
74-
2.04891858946906374183E-7, 2.89137052083475648297E-6,
75-
6.88975834691682398426E-5, 3.36911647825569408990E-3,
76-
8.04490411014108831608E-1};
77-
78-
return static_cast<T>((exp(x) * chbevl(32.0 / x - 2.0, B, 25)) / sqrt(x));
79-
}
80-
81-
// Copied from
82-
// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L576
83-
84-
template <typename T>
85-
T i1(T _x) {
86-
const auto x = fabs(_x);
87-
88-
if (x <= 8.0) {
89-
// Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8]
90-
// lim(x->0){ exp(-x) i1(x) / x } = 1/2
91-
const float coefficients[] = {
92-
2.77791411276104639959E-18, -2.11142121435816608115E-17,
93-
1.55363195773620046921E-16, -1.10559694773538630805E-15,
94-
7.60068429473540693410E-15, -5.04218550472791168711E-14,
95-
3.22379336594557470981E-13, -1.98397439776494371520E-12,
96-
1.17361862988909016308E-11, -6.66348972350202774223E-11,
97-
3.62559028155211703701E-10, -1.88724975172282928790E-9,
98-
9.38153738649577178388E-9, -4.44505912879632808065E-8,
99-
2.00329475355213526229E-7, -8.56872026469545474066E-7,
100-
3.47025130813767847674E-6, -1.32731636560394358279E-5,
101-
4.78156510755005422638E-5, -1.61760815825896745588E-4,
102-
5.12285956168575772895E-4, -1.51357245063125314899E-3,
103-
4.15642294431288815669E-3, -1.05640848946261981558E-2,
104-
2.47264490306265168283E-2, -5.29459812080949914269E-2,
105-
1.02643658689847095384E-1, -1.76416518357834055153E-1,
106-
2.52587186443633654823E-1};
107-
const auto y = x / 2.0 - 2.0;
108-
const auto out = exp(x) * x * chbevl(y, coefficients, 29);
109-
return static_cast<T>(_x < T(0.) ? -out : out);
110-
}
111-
112-
// Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
113-
// in the inverted interval [8, infinity]
114-
// lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi)
115-
const float coefficients[] = {
116-
7.51729631084210481353E-18, 4.41434832307170791151E-18,
117-
-4.65030536848935832153E-17, -3.20952592199342395980E-17,
118-
2.96262899764595013876E-16, 3.30820231092092828324E-16,
119-
-1.88035477551078244854E-15, -3.81440307243700780478E-15,
120-
1.04202769841288027642E-14, 4.27244001671195135429E-14,
121-
-2.10154184277266431302E-14, -4.08355111109219731823E-13,
122-
-7.19855177624590851209E-13, 2.03562854414708950722E-12,
123-
1.41258074366137813316E-11, 3.25260358301548823856E-11,
124-
-1.89749581235054123450E-11, -5.58974346219658380687E-10,
125-
-3.83538038596423702205E-9, -2.63146884688951950684E-8,
126-
-2.51223623787020892529E-7, -3.88256480887769039346E-6,
127-
-1.10588938762623716291E-4, -9.76109749136146840777E-3,
128-
7.78576235018280120474E-1};
129-
const auto out = (exp(x) * chbevl(32. / x - 2., coefficients, 25)) / sqrt(x);
130-
return static_cast<T>(_x < T(0.) ? -out : out);
131-
}
1+
#include <c10/metal/special_math.h>
1322

1333
template <typename T, typename Tout = T>
1344
void kernel
1355
i0(constant T* input,
1366
device Tout* output,
1377
uint index [[thread_position_in_grid]]) {
138-
output[index] = i0(static_cast<Tout>(input[index]));
8+
output[index] = c10::metal::i0(static_cast<Tout>(input[index]));
1399
}
14010

14111
template <typename T, typename Tout = T>
14212
void kernel
14313
i1(constant T* input,
14414
device Tout* output,
14515
uint index [[thread_position_in_grid]]) {
146-
output[index] = i1(static_cast<Tout>(input[index]));
16+
output[index] = c10::metal::i1(static_cast<Tout>(input[index]));
14717
}
14818

14919
#define REGISTER_I0_I1(DTI, DTO) \

c10/metal/special_math.h

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// Implementation of specal math functions for Metal
2+
#include <metal_stdlib>
3+
4+
namespace c10 {
5+
namespace metal {
6+
7+
/*
8+
* For licensing information and documentation, please refer to the cpu
9+
* implementation located in "ATen/native/Math.h".
10+
*/
11+
12+
template <typename T>
13+
T chbevl(T x, const float array[], const int len) {
14+
T b0, b1, b2;
15+
16+
b0 = array[0];
17+
b1 = 0;
18+
19+
for (int i = 1; i < len; ++i) {
20+
b2 = b1;
21+
b1 = b0;
22+
b0 = x * b1 - b2 + array[i];
23+
}
24+
25+
return T{0.5} * (b0 - b2);
26+
}
27+
28+
// Copied from
29+
// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L502
30+
31+
template <typename T>
32+
T i0(T _x) {
33+
auto x = ::metal::fabs(_x);
34+
35+
if (x <= 8.0) {
36+
/* Chebyshev coefficients for exp(-x) I0(x)
37+
* in the interval [0,8].
38+
*
39+
* lim(x->0){ exp(-x) I0(x) } = 1.
40+
*/
41+
const float A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
42+
-2.43127984654795469359E-16, 1.71539128555513303061E-15,
43+
-1.16853328779934516808E-14, 7.67618549860493561688E-14,
44+
-4.85644678311192946090E-13, 2.95505266312963983461E-12,
45+
-1.72682629144155570723E-11, 9.67580903537323691224E-11,
46+
-5.18979560163526290666E-10, 2.65982372468238665035E-9,
47+
-1.30002500998624804212E-8, 6.04699502254191894932E-8,
48+
-2.67079385394061173391E-7, 1.11738753912010371815E-6,
49+
-4.41673835845875056359E-6, 1.64484480707288970893E-5,
50+
-5.75419501008210370398E-5, 1.88502885095841655729E-4,
51+
-5.76375574538582365885E-4, 1.63947561694133579842E-3,
52+
-4.32430999505057594430E-3, 1.05464603945949983183E-2,
53+
-2.37374148058994688156E-2, 4.93052842396707084878E-2,
54+
-9.49010970480476444210E-2, 1.71620901522208775349E-1,
55+
-3.04682672343198398683E-1, 6.76795274409476084995E-1};
56+
57+
auto y = (x / 2.0) - 2.0;
58+
return static_cast<T>(::metal::exp(x) * chbevl(y, A, 30));
59+
}
60+
61+
// Handles x > 8 case
62+
/* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
63+
* in the inverted interval [8,infinity].
64+
*
65+
* lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
66+
*/
67+
const float B[] = {-7.23318048787475395456E-18, -4.83050448594418207126E-18,
68+
4.46562142029675999901E-17, 3.46122286769746109310E-17,
69+
-2.82762398051658348494E-16, -3.42548561967721913462E-16,
70+
1.77256013305652638360E-15, 3.81168066935262242075E-15,
71+
-9.55484669882830764870E-15, -4.15056934728722208663E-14,
72+
1.54008621752140982691E-14, 3.85277838274214270114E-13,
73+
7.18012445138366623367E-13, -1.79417853150680611778E-12,
74+
-1.32158118404477131188E-11, -3.14991652796324136454E-11,
75+
1.18891471078464383424E-11, 4.94060238822496958910E-10,
76+
3.39623202570838634515E-9, 2.26666899049817806459E-8,
77+
2.04891858946906374183E-7, 2.89137052083475648297E-6,
78+
6.88975834691682398426E-5, 3.36911647825569408990E-3,
79+
8.04490411014108831608E-1};
80+
81+
return static_cast<T>(
82+
(::metal::exp(x) * chbevl(32.0 / x - 2.0, B, 25)) / ::metal::sqrt(x));
83+
}
84+
85+
// Copied from
86+
// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L576
87+
88+
template <typename T>
89+
T i1(T _x) {
90+
const auto x = ::metal::fabs(_x);
91+
92+
if (x <= 8.0) {
93+
// Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8]
94+
// lim(x->0){ exp(-x) i1(x) / x } = 1/2
95+
const float coefficients[] = {
96+
2.77791411276104639959E-18, -2.11142121435816608115E-17,
97+
1.55363195773620046921E-16, -1.10559694773538630805E-15,
98+
7.60068429473540693410E-15, -5.04218550472791168711E-14,
99+
3.22379336594557470981E-13, -1.98397439776494371520E-12,
100+
1.17361862988909016308E-11, -6.66348972350202774223E-11,
101+
3.62559028155211703701E-10, -1.88724975172282928790E-9,
102+
9.38153738649577178388E-9, -4.44505912879632808065E-8,
103+
2.00329475355213526229E-7, -8.56872026469545474066E-7,
104+
3.47025130813767847674E-6, -1.32731636560394358279E-5,
105+
4.78156510755005422638E-5, -1.61760815825896745588E-4,
106+
5.12285956168575772895E-4, -1.51357245063125314899E-3,
107+
4.15642294431288815669E-3, -1.05640848946261981558E-2,
108+
2.47264490306265168283E-2, -5.29459812080949914269E-2,
109+
1.02643658689847095384E-1, -1.76416518357834055153E-1,
110+
2.52587186443633654823E-1};
111+
const auto y = x / 2.0 - 2.0;
112+
const auto out = ::metal::exp(x) * x * chbevl(y, coefficients, 29);
113+
return static_cast<T>(_x < T(0.) ? -out : out);
114+
}
115+
116+
// Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
117+
// in the inverted interval [8, infinity]
118+
// lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi)
119+
const float coefficients[] = {
120+
7.51729631084210481353E-18, 4.41434832307170791151E-18,
121+
-4.65030536848935832153E-17, -3.20952592199342395980E-17,
122+
2.96262899764595013876E-16, 3.30820231092092828324E-16,
123+
-1.88035477551078244854E-15, -3.81440307243700780478E-15,
124+
1.04202769841288027642E-14, 4.27244001671195135429E-14,
125+
-2.10154184277266431302E-14, -4.08355111109219731823E-13,
126+
-7.19855177624590851209E-13, 2.03562854414708950722E-12,
127+
1.41258074366137813316E-11, 3.25260358301548823856E-11,
128+
-1.89749581235054123450E-11, -5.58974346219658380687E-10,
129+
-3.83538038596423702205E-9, -2.63146884688951950684E-8,
130+
-2.51223623787020892529E-7, -3.88256480887769039346E-6,
131+
-1.10588938762623716291E-4, -9.76109749136146840777E-3,
132+
7.78576235018280120474E-1};
133+
const auto out = (::metal::exp(x) * chbevl(32. / x - 2., coefficients, 25)) /
134+
::metal::sqrt(x);
135+
return static_cast<T>(_x < T(0.) ? -out : out);
136+
}
137+
138+
} // namespace metal
139+
} // namespace c10

cmake/Metal.cmake

+9-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ if(WERROR)
88
endif()
99

1010
function(metal_to_air SRC TARGET FLAGS)
11-
add_custom_command(COMMAND xcrun metal -c ${SRC} -o ${TARGET} ${FLAGS} ${METAL_CFLAGS}
11+
add_custom_command(COMMAND xcrun metal -c ${SRC} -I ${CMAKE_SOURCE_DIR} -o ${TARGET} ${FLAGS} ${METAL_CFLAGS}
1212
DEPENDS ${SRC}
1313
OUTPUT ${TARGET}
1414
COMMENT "Compiling ${SRC} to ${TARGET}"
@@ -25,7 +25,14 @@ function(air_to_metallib TARGET OBJECTS)
2525
endfunction()
2626

2727
function(metal_to_metallib_h SRC TGT)
28-
file(READ ${SRC} SHADER_CONTENT)
28+
execute_process(COMMAND ${Python_EXECUTABLE} torch/utils/_cpp_embed_headers.py ${SRC}
29+
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
30+
OUTPUT_VARIABLE SHADER_CONTENT
31+
RESULT_VARIABLE _exitcode)
32+
if(NOT _exitcode EQUAL 0)
33+
message(FATAL_ERROR "Failed to preprocess Metal shader ${SRC}")
34+
return()
35+
endif()
2936
file(WRITE ${TGT} "#include <ATen/native/mps/OperationUtils.h>\n")
3037
file(APPEND ${TGT} "static ::at::native::mps::MetalShaderLibrary lib(R\"SHDR(\n")
3138
file(APPEND ${TGT} "${SHADER_CONTENT}")

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -1248,6 +1248,7 @@ def main():
12481248
"include/c10/cuda/impl/*.h",
12491249
"include/c10/hip/*.h",
12501250
"include/c10/hip/impl/*.h",
1251+
"include/c10/metal/*.h",
12511252
"include/c10/xpu/*.h",
12521253
"include/c10/xpu/impl/*.h",
12531254
"include/torch/*.h",

test/test_mps.py

+5
Original file line numberDiff line numberDiff line change
@@ -12698,6 +12698,11 @@ def test_metal_error_checking(self):
1269812698
# Passing no tensors asserts
1269912699
self.assertRaises(RuntimeError, lambda: lib.full(12))
1270012700

12701+
def test_metal_include(self):
12702+
# Checks that includes embedding works
12703+
lib = torch.mps._compile_shader("#include <c10/metal/special_math.h>")
12704+
self.assertIsNotNone(lib)
12705+
1270112706
@unittest.skipIf(not torch.mps.profiler.is_metal_capture_enabled(), "Set MTL_CAPTURE_ENABLED and try again")
1270212707
def test_metal_capture(self):
1270312708
lib = torch.mps._compile_shader("kernel void full(device float* x, uint idx [[thread_position_in_grid]]) { x[idx] = 1.0; }")

torch/mps/__init__.py

+9
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,17 @@ def _compile_shader(source: str):
152152
>>> x = torch.zeros(16, device="mps")
153153
>>> lib.full(x, 3.14)
154154
"""
155+
from pathlib import Path
156+
157+
from torch.utils._cpp_embed_headers import _embed_headers
158+
155159
if not hasattr(torch._C, "_mps_compileShader"):
156160
raise RuntimeError("MPS is not available")
161+
source = _embed_headers(
162+
[l + "\n" for l in source.split("\n")],
163+
[Path(__file__).parent.parent / "include"],
164+
set(),
165+
)
157166
return torch._C._mps_compileShader(source)
158167

159168

0 commit comments

Comments
 (0)