|
1 |
| -#include <metal_stdlib> |
2 |
| -using namespace metal; |
3 |
| - |
4 |
| -/* |
5 |
| - * For licensing information and documentation, please refer to the cpu |
6 |
| - * implementation located in "ATen/native/Math.h". |
7 |
| - */ |
8 |
| - |
9 |
/*
 * Evaluates a Chebyshev-style series at `x` using the three-term recurrence
 *
 *     b0 = x * b1 - b2 + array[i]
 *
 * seeded with b0 = array[0], b1 = 0, and returns 0.5 * (b0 - b2).
 *
 * x     - point at which the series is evaluated.
 * array - coefficient table; entries [0, len) are read.
 * len   - number of coefficients to consume from `array`.
 *
 * Returns T(0) for an empty series (len <= 0) without touching `array`.
 */
template <typename T>
T chbevl(T x, const float array[], const int len) {
  // An empty series has no coefficient to read; bail out before array[0].
  if (len <= 0) {
    return T(0);
  }

  T b0 = array[0];
  T b1 = 0;
  // b2 is only assigned inside the loop; zero-initialise it so that a
  // single-coefficient series (len == 1) does not read an indeterminate value.
  T b2 = 0;

  for (int i = 1; i < len; ++i) {
    b2 = b1;
    b1 = b0;
    b0 = x * b1 - b2 + array[i];
  }

  return T{0.5} * (b0 - b2);
}
24 |
| - |
25 |
| -// Copied from |
26 |
| -// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L502 |
27 |
| - |
28 |
/*
 * Modified Bessel function of the first kind, order zero.
 *
 * Works on |_x| and picks one of two Chebyshev expansions (evaluated through
 * chbevl) depending on whether the magnitude is at most 8 or larger. The
 * result is cast back to T.
 */
template <typename T>
T i0(T _x) {
  auto magnitude = fabs(_x);
  // Both branches scale their series by exp(|x|).
  const auto growth = exp(magnitude);

  if (magnitude <= 8.0) {
    // Chebyshev coefficients for exp(-x) I0(x) on the interval [0, 8];
    // lim(x->0){ exp(-x) I0(x) } = 1.
    const float small_coeffs[] = {
        -4.41534164647933937950E-18, 3.33079451882223809783E-17,
        -2.43127984654795469359E-16, 1.71539128555513303061E-15,
        -1.16853328779934516808E-14, 7.67618549860493561688E-14,
        -4.85644678311192946090E-13, 2.95505266312963983461E-12,
        -1.72682629144155570723E-11, 9.67580903537323691224E-11,
        -5.18979560163526290666E-10, 2.65982372468238665035E-9,
        -1.30002500998624804212E-8,  6.04699502254191894932E-8,
        -2.67079385394061173391E-7,  1.11738753912010371815E-6,
        -4.41673835845875056359E-6,  1.64484480707288970893E-5,
        -5.75419501008210370398E-5,  1.88502885095841655729E-4,
        -5.76375574538582365885E-4,  1.63947561694133579842E-3,
        -4.32430999505057594430E-3,  1.05464603945949983183E-2,
        -2.37374148058994688156E-2,  4.93052842396707084878E-2,
        -9.49010970480476444210E-2,  1.71620901522208775349E-1,
        -3.04682672343198398683E-1,  6.76795274409476084995E-1};

    // Affine map applied to |x| before it is handed to the series.
    auto mapped = (magnitude / 2.0) - 2.0;
    const auto series = chbevl(mapped, small_coeffs, 30);
    return static_cast<T>(growth * series);
  }

  // |x| > 8: Chebyshev coefficients for exp(-x) sqrt(x) I0(x) on the inverted
  // interval [8, infinity]; lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
  const float large_coeffs[] = {
      -7.23318048787475395456E-18, -4.83050448594418207126E-18,
      4.46562142029675999901E-17,  3.46122286769746109310E-17,
      -2.82762398051658348494E-16, -3.42548561967721913462E-16,
      1.77256013305652638360E-15,  3.81168066935262242075E-15,
      -9.55484669882830764870E-15, -4.15056934728722208663E-14,
      1.54008621752140982691E-14,  3.85277838274214270114E-13,
      7.18012445138366623367E-13,  -1.79417853150680611778E-12,
      -1.32158118404477131188E-11, -3.14991652796324136454E-11,
      1.18891471078464383424E-11,  4.94060238822496958910E-10,
      3.39623202570838634515E-9,   2.26666899049817806459E-8,
      2.04891858946906374183E-7,   2.89137052083475648297E-6,
      6.88975834691682398426E-5,   3.36911647825569408990E-3,
      8.04490411014108831608E-1};

  const auto series = chbevl(32.0 / magnitude - 2.0, large_coeffs, 25);
  return static_cast<T>((growth * series) / sqrt(magnitude));
}
80 |
| - |
81 |
| -// Copied from |
82 |
| -// https://github.com/pytorch/pytorch/blob/58b661cda2c002a8e1ac3bee494bfe1f7420437c/aten/src/ATen/native/cuda/Math.cuh#L576 |
83 |
| - |
84 |
/*
 * Modified Bessel function of the first kind, order one.
 *
 * The series is evaluated on |_x| using one of two Chebyshev expansions
 * (split at |x| = 8, evaluated through chbevl); the sign of the result is
 * then flipped when _x is negative. The result is cast back to T.
 */
template <typename T>
T i1(T _x) {
  const auto magnitude = fabs(_x);
  // Decides whether the magnitude-based result must be negated.
  const bool negative = _x < T(0.);
  // Both branches scale their series by exp(|x|).
  const auto growth = exp(magnitude);

  if (magnitude <= 8.0) {
    // Chebyshev coefficients for exp(-x) i1(x) on the interval [0, 8];
    // lim(x->0){ exp(-x) i1(x) / x } = 1/2.
    const float small_coeffs[] = {
        2.77791411276104639959E-18, -2.11142121435816608115E-17,
        1.55363195773620046921E-16, -1.10559694773538630805E-15,
        7.60068429473540693410E-15, -5.04218550472791168711E-14,
        3.22379336594557470981E-13, -1.98397439776494371520E-12,
        1.17361862988909016308E-11, -6.66348972350202774223E-11,
        3.62559028155211703701E-10, -1.88724975172282928790E-9,
        9.38153738649577178388E-9,  -4.44505912879632808065E-8,
        2.00329475355213526229E-7,  -8.56872026469545474066E-7,
        3.47025130813767847674E-6,  -1.32731636560394358279E-5,
        4.78156510755005422638E-5,  -1.61760815825896745588E-4,
        5.12285956168575772895E-4,  -1.51357245063125314899E-3,
        4.15642294431288815669E-3,  -1.05640848946261981558E-2,
        2.47264490306265168283E-2,  -5.29459812080949914269E-2,
        1.02643658689847095384E-1,  -1.76416518357834055153E-1,
        2.52587186443633654823E-1};

    // Affine map applied to |x| before it is handed to the series.
    const auto mapped = magnitude / 2.0 - 2.0;
    const auto series = chbevl(mapped, small_coeffs, 29);
    const auto out = growth * magnitude * series;
    if (negative) {
      return static_cast<T>(-out);
    }
    return static_cast<T>(out);
  }

  // |x| > 8: Chebyshev coefficients for exp(-x) sqrt(x) i1(x) on the inverted
  // interval [8, infinity]; lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi).
  const float large_coeffs[] = {
      7.51729631084210481353E-18,  4.41434832307170791151E-18,
      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
      2.96262899764595013876E-16,  3.30820231092092828324E-16,
      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
      1.04202769841288027642E-14,  4.27244001671195135429E-14,
      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
      1.41258074366137813316E-11,  3.25260358301548823856E-11,
      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
      7.78576235018280120474E-1};

  const auto series = chbevl(32. / magnitude - 2., large_coeffs, 25);
  const auto out = (growth * series) / sqrt(magnitude);
  if (negative) {
    return static_cast<T>(-out);
  }
  return static_cast<T>(out);
}
| 1 | +#include <c10/metal/special_math.h> |
132 | 2 |
|
133 | 3 | template <typename T, typename Tout = T>
|
134 | 4 | void kernel
|
135 | 5 | i0(constant T* input,
|
136 | 6 | device Tout* output,
|
137 | 7 | uint index [[thread_position_in_grid]]) {
|
138 |
| - output[index] = i0(static_cast<Tout>(input[index])); |
| 8 | + output[index] = c10::metal::i0(static_cast<Tout>(input[index])); |
139 | 9 | }
|
140 | 10 |
|
141 | 11 | template <typename T, typename Tout = T>
|
142 | 12 | void kernel
|
143 | 13 | i1(constant T* input,
|
144 | 14 | device Tout* output,
|
145 | 15 | uint index [[thread_position_in_grid]]) {
|
146 |
| - output[index] = i1(static_cast<Tout>(input[index])); |
| 16 | + output[index] = c10::metal::i1(static_cast<Tout>(input[index])); |
147 | 17 | }
|
148 | 18 |
|
149 | 19 | #define REGISTER_I0_I1(DTI, DTO) \
|
|
0 commit comments