-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathcontains_amd64.s
270 lines (248 loc) · 4.52 KB
/
contains_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
// Code generated by command: go run contains_asm.go -pkg mem -out ../mem/contains_amd64.s -stubs ../mem/contains_amd64.go. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
// func ContainsByte(haystack []byte, needle byte) bool
// Requires: AVX, AVX2, SSE2, SSE4.1
//
// ContainsByte stores true in ret if any byte of haystack equals needle,
// false otherwise. The length is dispatched to a size-specific handler; every
// handler checks the first and the last bytes of the remaining range with
// (possibly overlapping) loads so that no byte-by-byte loop is needed.
//
// Frame: $0-33 = no locals; arguments are the slice header (base at +0,
// len at +8; the cap word is never read), needle at +24 and the result at +32.
//
// Register roles:
//   AX     pointer to the next unchecked byte (overwritten inside handlers)
//   CX     number of bytes still to check (overwritten inside handlers)
//   DX     needle replicated into all 8 bytes
//   BX     0x0101010101010101 \ constants for the "has a zero byte" test
//   SI     0x8080808080808080 /
//   X0/Y0  needle replicated into every byte lane (only set when len > 16)
//   X1/Y1  all zeroes, second operand of PTEST/VPTEST (only set when len > 16)
TEXT ·ContainsByte(SB), NOSPLIT, $0-33
MOVQ haystack_base+0(FP), AX
MOVQ haystack_len+8(FP), CX
// Build DX = needle repeated 8 times: place the byte in DL, then double the
// number of populated bytes three times (1 -> 2 -> 4 -> 8).
XORQ DX, DX
MOVB needle+24(FP), DL
MOVQ DX, BX
SHLQ $0x08, BX
ORQ BX, DX
MOVQ DX, BX
SHLQ $0x10, BX
ORQ BX, DX
MOVQ DX, BX
SHLQ $0x20, BX
ORQ BX, DX
MOVQ $0x0101010101010101, BX
MOVQ $0x8080808080808080, SI
// Default result is false; only the found/avx2_found stubs overwrite it.
MOVB $0x00, ret+32(FP)
JMP start
// Match exit for the scalar and SSE paths.
found:
MOVB $0x01, ret+32(FP)
JMP done
// Match exit for the AVX2 path; leaves through avx2_done so that VZEROUPPER
// is executed before returning.
avx2_found:
MOVB $0x01, ret+32(FP)
JMP avx2_done
start:
// Inputs of at most 16 bytes are handled with general-purpose registers
// only, so the vector setup below is skipped for them.
CMPQ CX, $0x10
JBE tail
// X1 = 0, X0 = needle in all 16 bytes (DX inserted into both quadwords).
PXOR X1, X1
PINSRQ $0x00, DX, X0
PINSRQ $0x01, DX, X0
// Length dispatch. Also re-entered from the generic loop once at most 64
// bytes remain (including zero).
tail:
CMPQ CX, $0x00
JE done
CMPQ CX, $0x01
JE handle1
CMPQ CX, $0x03
JBE handle2to3
CMPQ CX, $0x04
JE handle4
// One compare serves two branches: below 8 -> 5..7, equal to 8 -> exactly 8.
CMPQ CX, $0x08
JB handle5to7
JE handle8
CMPQ CX, $0x10
JBE handle9to16
CMPQ CX, $0x20
JBE handle17to32
CMPQ CX, $0x40
JBE handle33to64
// More than 64 bytes: test bit 8 of the cpu package's X86 feature word and
// fall back to the SSE loop when it is clear.
// NOTE(review): bit 8 is presumably the AVX2 flag - confirm against the cpu package.
BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
JCC generic
// VZEROUPPER clears the upper 128 bits of every YMM register, which makes
// Y1 entirely zero (its low half was zeroed by PXOR above). Y0 is then
// rebuilt as the needle in all 32 bytes.
VZEROUPPER
VPBROADCASTQ X0, Y0
// Fewer than 256 bytes cannot fill one iteration of the main AVX2 loop.
CMPQ CX, $0x00000100
JB avx2_tail
JMP avx2
// SSE loop: 64 bytes per iteration using four unaligned 16-byte loads.
generic:
MOVOU (AX), X2
MOVOU 16(AX), X3
MOVOU 32(AX), X4
MOVOU 48(AX), X5
// Matching bytes become 0xFF, all others 0x00.
PCMPEQB X0, X2
PCMPEQB X0, X3
PCMPEQB X0, X4
PCMPEQB X0, X5
// Fold the four compare masks into X5.
POR X2, X3
POR X4, X5
POR X3, X5
// With X1 all zero, PTEST sets the carry flag only when X5 is all zero, so
// carry clear (JCC) means at least one byte matched.
PTEST X5, X1
JCC found
ADDQ $0x40, AX
SUBQ $0x40, CX
// At most 64 bytes left (possibly none): let the dispatcher finish.
CMPQ CX, $0x40
JBE tail
JMP generic
done:
RET
// Exactly 1 byte: direct compare against the needle in DL.
handle1:
MOVB (AX), AL
CMPB AL, DL
JE found
RET
// 2..3 bytes: load the first and the last 2 bytes (the same word when len is
// 2, overlapping by one byte when len is 3). For each word v = data XOR
// needle, a matching byte is zero in v, and (v - 0x0101) & ^v & 0x8080 is
// non-zero exactly when v contains a zero byte. Both results are ORed
// before the final mask so a single JNZ covers both loads.
handle2to3:
MOVW (AX), DI
MOVW -2(AX)(CX*1), AX
XORW DX, DI
MOVW DI, CX
XORW DX, AX
MOVW AX, DX
SUBW BX, CX
NOTW DI
ANDW DI, CX
SUBW BX, DX
NOTW AX
ANDW AX, DX
ORW CX, DX
ANDW SI, DX
JNZ found
RET
// Exactly 4 bytes: same zero-byte test on a single 32-bit load.
handle4:
MOVL (AX), AX
XORL DX, AX
MOVL AX, CX
SUBL BX, CX
NOTL AX
ANDL AX, CX
ANDL SI, CX
JNZ found
RET
// 5..7 bytes: zero-byte test on the first and the last 4 bytes (overlapping).
handle5to7:
MOVL (AX), DI
MOVL -4(AX)(CX*1), AX
XORL DX, DI
MOVL DI, CX
XORL DX, AX
MOVL AX, DX
SUBL BX, CX
NOTL DI
ANDL DI, CX
SUBL BX, DX
NOTL AX
ANDL AX, DX
ORL CX, DX
ANDL SI, DX
JNZ found
RET
// Exactly 8 bytes: same zero-byte test on a single 64-bit load.
handle8:
MOVQ (AX), AX
XORQ DX, AX
MOVQ AX, CX
SUBQ BX, CX
NOTQ AX
ANDQ AX, CX
ANDQ SI, CX
JNZ found
RET
// 9..16 bytes: zero-byte test on the first and the last 8 bytes (overlapping
// unless len is 16).
handle9to16:
MOVQ (AX), DI
MOVQ -8(AX)(CX*1), AX
XORQ DX, DI
MOVQ DI, CX
XORQ DX, AX
MOVQ AX, DX
SUBQ BX, CX
NOTQ DI
ANDQ DI, CX
SUBQ BX, DX
NOTQ AX
ANDQ AX, DX
ORQ CX, DX
ANDQ SI, DX
JNZ found
RET
// 17..32 bytes: SSE compare of the first and the last 16 bytes.
handle17to32:
MOVOU (AX), X2
MOVOU -16(AX)(CX*1), X3
PCMPEQB X0, X2
PCMPEQB X0, X3
POR X2, X3
// Carry clear means the combined mask is non-zero (X1 is all zero).
PTEST X3, X1
JCC found
RET
// 33..64 bytes: SSE compare of the first and the last 32 bytes.
handle33to64:
MOVOU (AX), X2
MOVOU 16(AX), X3
MOVOU -32(AX)(CX*1), X4
MOVOU -16(AX)(CX*1), X5
PCMPEQB X0, X2
PCMPEQB X0, X3
PCMPEQB X0, X4
PCMPEQB X0, X5
POR X2, X3
POR X4, X5
POR X3, X5
PTEST X5, X1
JCC found
RET
// AVX2 optimized version for medium to large size inputs.
// Main loop: 256 bytes per iteration as eight 32-byte compares taken directly
// from memory, folded into Y9 with a tree of ORs.
avx2:
VPCMPEQB (AX), Y0, Y2
VPCMPEQB 32(AX), Y0, Y3
VPCMPEQB 64(AX), Y0, Y4
VPCMPEQB 96(AX), Y0, Y5
VPCMPEQB 128(AX), Y0, Y6
VPCMPEQB 160(AX), Y0, Y7
VPCMPEQB 192(AX), Y0, Y8
VPCMPEQB 224(AX), Y0, Y9
VPOR Y2, Y3, Y3
VPOR Y4, Y5, Y5
VPOR Y6, Y7, Y7
VPOR Y8, Y9, Y9
VPOR Y3, Y5, Y5
VPOR Y7, Y9, Y9
VPOR Y5, Y9, Y9
// Y1 is all zero, so carry clear means some lane of Y9 is non-zero.
VPTEST Y9, Y1
JCC avx2_found
ADDQ $0x00000100, AX
SUBQ $0x00000100, CX
// SUBQ sets ZF when nothing remains: return with ret still false.
JZ avx2_done
CMPQ CX, $0x00000100
JAE avx2
// Fewer than 256 bytes remain (and more than zero).
avx2_tail:
CMPQ CX, $0x40
JBE avx2_tail_1to64
CMPQ CX, $0x80
JBE avx2_tail_65to128
// 129..255 bytes: compare the first 128 and the last 128 bytes; the two
// windows overlap. The last compare overwrites Y0 (the needle), which is
// not needed again on this path.
VPCMPEQB (AX), Y0, Y2
VPCMPEQB 32(AX), Y0, Y3
VPCMPEQB 64(AX), Y0, Y4
VPCMPEQB 96(AX), Y0, Y5
VPCMPEQB -128(AX)(CX*1), Y0, Y6
VPCMPEQB -96(AX)(CX*1), Y0, Y7
VPCMPEQB -64(AX)(CX*1), Y0, Y8
VPCMPEQB -32(AX)(CX*1), Y0, Y0
VPOR Y2, Y3, Y3
VPOR Y4, Y5, Y5
VPOR Y6, Y7, Y7
VPOR Y8, Y0, Y0
VPOR Y3, Y5, Y5
VPOR Y7, Y0, Y0
VPOR Y5, Y0, Y0
VPTEST Y0, Y1
JCC avx2_found
JMP avx2_done
// 65..128 bytes: compare the first 64 and the last 64 bytes.
avx2_tail_65to128:
VPCMPEQB (AX), Y0, Y2
VPCMPEQB 32(AX), Y0, Y3
VPCMPEQB -64(AX)(CX*1), Y0, Y4
VPCMPEQB -32(AX)(CX*1), Y0, Y0
VPOR Y2, Y3, Y3
VPOR Y4, Y0, Y0
VPOR Y3, Y0, Y0
VPTEST Y0, Y1
JCC avx2_found
JMP avx2_done
// 1..64 bytes: compare the 64 bytes that end at AX+CX. When CX < 64 this
// window starts before AX. That is within the haystack because this label is
// only reachable after the 256-byte loop has advanced AX: the first entry
// into avx2_tail always has CX > 64.
avx2_tail_1to64:
VPCMPEQB -64(AX)(CX*1), Y0, Y2
VPCMPEQB -32(AX)(CX*1), Y0, Y0
VPOR Y2, Y0, Y0
VPTEST Y0, Y1
JCC avx2_found
// Common AVX2 exit: clear the upper YMM halves before returning.
avx2_done:
VZEROUPPER
RET