@@ -90,11 +90,12 @@ safeload:
90
90
// key and then shuffle the key forward in the register. We can shuffle and
91
91
// pad with zeroes at the same time to avoid having to also blend (as load
92
92
// does).
93
- ADD R3 , R2 , R2
94
- SUB $ 16 , R2 , R2
93
+ MOVD $ 16 , R12
94
+ SUB R3 , R12 , R12
95
+ SUB R12 , R2 , R2
95
96
VLD1 (R2) , [ V0.B16 ]
96
97
MOVD $ shuffle_masks<>(SB) , R10
97
- ADD R3<< 4 , R10 , R10
98
+ ADD R12 , R10 , R10
98
99
VLD1 ( R10 ) , [ V2.B16 ]
99
100
VTBL V2.B16 , [ V0.B16 , V1.B16 ], V3.B16
100
101
JMP loop
@@ -135,38 +136,8 @@ DATA blend_masks<>+256(SB)/8, $0x0706050403020100
135
136
DATA blend_masks<> + 264 (SB)/ 8 , $ 0x0F0E0D0C0B0A0908
136
137
GLOBL blend_masks<>(SB) , RODATA|NOPTR , $ 272
137
138
138
- DATA shuffle_masks<> + 0 (SB)/ 8 , $ 0x1010101010101010
139
- DATA shuffle_masks<> + 8 (SB)/ 8 , $ 0x1010101010101010
140
- DATA shuffle_masks<> + 16 (SB)/ 8 , $ 0x101010101010100F
139
+ DATA shuffle_masks<> + 0 (SB)/ 8 , $ 0x0706050403020100
140
+ DATA shuffle_masks<> + 8 (SB)/ 8 , $ 0x0F0E0D0C0B0A0908
141
+ DATA shuffle_masks<> + 16 (SB)/ 8 , $ 0x1010101010101010
141
142
DATA shuffle_masks<> + 24 (SB)/ 8 , $ 0x1010101010101010
142
- DATA shuffle_masks<> + 32 (SB)/ 8 , $ 0x1010101010100F0E
143
- DATA shuffle_masks<> + 40 (SB)/ 8 , $ 0x1010101010101010
144
- DATA shuffle_masks<> + 48 (SB)/ 8 , $ 0x10101010100F0E0D
145
- DATA shuffle_masks<> + 56 (SB)/ 8 , $ 0x1010101010101010
146
- DATA shuffle_masks<> + 64 (SB)/ 8 , $ 0x101010100F0E0D0C
147
- DATA shuffle_masks<> + 72 (SB)/ 8 , $ 0x1010101010101010
148
- DATA shuffle_masks<> + 80 (SB)/ 8 , $ 0x1010100F0E0D0C0B
149
- DATA shuffle_masks<> + 88 (SB)/ 8 , $ 0x1010101010101010
150
- DATA shuffle_masks<> + 96 (SB)/ 8 , $ 0x10100F0E0D0C0B0A
151
- DATA shuffle_masks<> + 104 (SB)/ 8 , $ 0x1010101010101010
152
- DATA shuffle_masks<> + 112 (SB)/ 8 , $ 0x100F0E0D0C0B0A09
153
- DATA shuffle_masks<> + 120 (SB)/ 8 , $ 0x1010101010101010
154
- DATA shuffle_masks<> + 128 (SB)/ 8 , $ 0x0F0E0D0C0B0A0908
155
- DATA shuffle_masks<> + 136 (SB)/ 8 , $ 0x1010101010101010
156
- DATA shuffle_masks<> + 144 (SB)/ 8 , $ 0x0E0D0C0B0A090807
157
- DATA shuffle_masks<> + 152 (SB)/ 8 , $ 0x101010101010100F
158
- DATA shuffle_masks<> + 160 (SB)/ 8 , $ 0x0D0C0B0A09080706
159
- DATA shuffle_masks<> + 168 (SB)/ 8 , $ 0x1010101010100F0E
160
- DATA shuffle_masks<> + 176 (SB)/ 8 , $ 0x0C0B0A0908070605
161
- DATA shuffle_masks<> + 184 (SB)/ 8 , $ 0x10101010100F0E0D
162
- DATA shuffle_masks<> + 192 (SB)/ 8 , $ 0x0B0A090807060504
163
- DATA shuffle_masks<> + 200 (SB)/ 8 , $ 0x101010100F0E0D0C
164
- DATA shuffle_masks<> + 208 (SB)/ 8 , $ 0x0A09080706050403
165
- DATA shuffle_masks<> + 216 (SB)/ 8 , $ 0x1010100F0E0D0C0B
166
- DATA shuffle_masks<> + 224 (SB)/ 8 , $ 0x0908070605040302
167
- DATA shuffle_masks<> + 232 (SB)/ 8 , $ 0x10100F0E0D0C0B0A
168
- DATA shuffle_masks<> + 240 (SB)/ 8 , $ 0x0807060504030201
169
- DATA shuffle_masks<> + 248 (SB)/ 8 , $ 0x100F0E0D0C0B0A09
170
- DATA shuffle_masks<> + 256 (SB)/ 8 , $ 0x0706050403020100
171
- DATA shuffle_masks<> + 264 (SB)/ 8 , $ 0x0F0E0D0C0B0A0908
172
- GLOBL shuffle_masks<>(SB) , RODATA|NOPTR , $ 272
143
+ GLOBL shuffle_masks<>(SB) , RODATA|NOPTR , $ 32
0 commit comments