Skip to content

Commit 69827b5

Browse files
sophie-zhao authored and gopherbot committed
crypto/subtle: implement xorBytes in hardware on loong64
goos: linux
goarch: loong64
pkg: crypto/subtle
cpu: Loongson-3A6000 @ 2500.00MHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
XORBytes/8Bytes       11.250n ± 0%   6.403n ± 0%  -43.08% (p=0.000 n=20)
XORBytes/128Bytes      24.61n ± 0%   12.21n ± 0%  -50.39% (p=0.000 n=20)
XORBytes/2048Bytes     216.7n ± 0%   108.3n ± 0%  -50.02% (p=0.000 n=20)
XORBytes/32768Bytes    3.657µ ± 0%   1.683µ ± 0%  -53.98% (p=0.000 n=20)
geomean                121.7n        61.44n       -49.52%

                    │  bench.old   │              bench.new               │
                    │     B/s      │     B/s       vs base                │
XORBytes/8Bytes       678.1Mi ± 0%   1191.5Mi ± 0%   +75.72% (p=0.000 n=20)
XORBytes/128Bytes     4.844Gi ± 0%    9.766Gi ± 0%  +101.63% (p=0.000 n=20)
XORBytes/2048Bytes    8.801Gi ± 0%   17.619Gi ± 0%  +100.18% (p=0.000 n=20)
XORBytes/32768Bytes   8.346Gi ± 0%   18.137Gi ± 0%  +117.32% (p=0.000 n=20)
geomean               3.918Gi         7.763Gi        +98.14%

goos: linux
goarch: loong64
pkg: crypto/subtle
cpu: Loongson-3A5000 @ 2500.00MHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
XORBytes/8Bytes       16.420n ± 0%   8.806n ± 0%  -46.37% (p=0.000 n=20)
XORBytes/128Bytes      35.84n ± 0%   16.42n ± 0%  -54.19% (p=0.000 n=20)
XORBytes/2048Bytes     332.0n ± 0%   160.5n ± 0%  -51.64% (p=0.000 n=20)
XORBytes/32768Bytes    4.944µ ± 0%   2.474µ ± 0%  -49.96% (p=0.000 n=20)
geomean                176.3n        87.05n       -50.62%

                    │  bench.old   │              bench.new               │
                    │     B/s      │     B/s       vs base                │
XORBytes/8Bytes       464.7Mi ± 0%    866.4Mi ± 0%   +86.45% (p=0.000 n=20)
XORBytes/128Bytes     3.326Gi ± 0%    7.261Gi ± 0%  +118.31% (p=0.000 n=20)
XORBytes/2048Bytes    5.745Gi ± 0%   11.880Gi ± 0%  +106.80% (p=0.000 n=20)
XORBytes/32768Bytes   6.172Gi ± 0%   12.334Gi ± 0%   +99.83% (p=0.000 n=20)
geomean               2.705Gi         5.477Gi       +102.52%

Change-Id: Id404f9023a57025f78b6922659cfa8870881d646
Reviewed-on: https://go-review.googlesource.com/c/go/+/590175
Reviewed-by: abner chenc <[email protected]>
Reviewed-by: Roland Shoemaker <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Auto-Submit: Tim King <[email protected]>
Reviewed-by: Tim King <[email protected]>
1 parent 1dfb33e commit 69827b5

File tree

3 files changed

+177
-1
lines changed

3 files changed

+177
-1
lines changed

src/crypto/subtle/xor_generic.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build (!amd64 && !arm64 && !ppc64 && !ppc64le) || purego
5+
//go:build (!amd64 && !arm64 && !loong64 && !ppc64 && !ppc64le) || purego
66

77
package subtle
88

src/crypto/subtle/xor_loong64.go

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build !purego
6+
7+
package subtle
8+
9+
// xorBytes is the Go-side declaration of the assembly routine defined in
// xor_loong64.s. It XORs n bytes read through a and b and stores the result
// through dst.
// NOTE(review): callers presumably guarantee n > 0 and that all three
// pointers are valid for at least n bytes — confirm against the call site.
//
//go:noescape
10+
func xorBytes(dst, a, b *byte, n int)

src/crypto/subtle/xor_loong64.s

+166
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build !purego
6+
7+
#include "textflag.h"
8+
9+
// func xorBytes(dst, a, b *byte, n int)
10+
// xorBytes stores a[i] ^ b[i] into dst[i] for every i in [0, n).
//
// Register roles:
//	R4      dst cursor
//	R5      a cursor
//	R6      b cursor
//	R7      bytes remaining
//	R9      current chunk size: starts at 64 and is halved (SRLV $1) before
//	        each smaller block is considered
//	R10-R17 scratch for loaded and XORed data
//
// Structure: while at least 64 bytes remain, loop64 processes 64 bytes per
// iteration. The remainder is then handled by at most one pass through each
// of the 32/16/8/4/2/1-byte blocks. Inputs shorter than 64 bytes skip the
// loop and enter the matching block directly from the "tail" ladder; at
// that point R9 already equals the block size, so the following
// xor_*_check labels keep halving it consistently.
//
// n == 0 returns immediately without touching memory.
TEXT ·xorBytes(SB), NOSPLIT, $0
	MOVV	dst+0(FP), R4
	MOVV	a+8(FP), R5
	MOVV	b+16(FP), R6
	MOVV	n+24(FP), R7

	MOVV	$64, R9
	BGEU	R7, R9, loop64	// n >= 64

tail:
	// n < 64: find the largest power-of-two block that fits and jump to it.
	SRLV	$1, R9
	BGEU	R7, R9, xor_32	// n >= 32 && n < 64
	SRLV	$1, R9
	BGEU	R7, R9, xor_16	// n >= 16 && n < 32
	SRLV	$1, R9
	BGEU	R7, R9, xor_8	// n >= 8 && n < 16
	SRLV	$1, R9
	BGEU	R7, R9, xor_4	// n >= 4 && n < 8
	SRLV	$1, R9
	BGEU	R7, R9, xor_2	// n >= 2 && n < 4
	SRLV	$1, R9
	BGEU	R7, R9, xor_1	// n == 1
	// n == 0: nothing to do. Without this return control would fall through
	// into loop64, process 64 bytes, and then keep looping because the
	// remaining count wraps around as an unsigned value.
	RET

loop64:
	// First 32 bytes of the 64-byte chunk.
	MOVV	(R5), R10
	MOVV	8(R5), R11
	MOVV	16(R5), R12
	MOVV	24(R5), R13
	MOVV	(R6), R14
	MOVV	8(R6), R15
	MOVV	16(R6), R16
	MOVV	24(R6), R17
	XOR	R10, R14
	XOR	R11, R15
	XOR	R12, R16
	XOR	R13, R17
	MOVV	R14, (R4)
	MOVV	R15, 8(R4)
	MOVV	R16, 16(R4)
	MOVV	R17, 24(R4)
	// Second 32 bytes of the 64-byte chunk.
	MOVV	32(R5), R10
	MOVV	40(R5), R11
	MOVV	48(R5), R12
	MOVV	56(R5), R13
	MOVV	32(R6), R14
	MOVV	40(R6), R15
	MOVV	48(R6), R16
	MOVV	56(R6), R17
	XOR	R10, R14
	XOR	R11, R15
	XOR	R12, R16
	XOR	R13, R17
	MOVV	R14, 32(R4)
	MOVV	R15, 40(R4)
	MOVV	R16, 48(R4)
	MOVV	R17, 56(R4)
	ADDV	$64, R5
	ADDV	$64, R6
	ADDV	$64, R4
	SUBV	$64, R7
	// 64 in R9
	BGEU	R7, R9, loop64
	BEQ	R7, R0, end

xor_32_check:
	SRLV	$1, R9		// R9 = 32
	BLT	R7, R9, xor_16_check
xor_32:
	MOVV	(R5), R10
	MOVV	8(R5), R11
	MOVV	16(R5), R12
	MOVV	24(R5), R13
	MOVV	(R6), R14
	MOVV	8(R6), R15
	MOVV	16(R6), R16
	MOVV	24(R6), R17
	XOR	R10, R14
	XOR	R11, R15
	XOR	R12, R16
	XOR	R13, R17
	MOVV	R14, (R4)
	MOVV	R15, 8(R4)
	MOVV	R16, 16(R4)
	MOVV	R17, 24(R4)
	ADDV	$32, R5
	ADDV	$32, R6
	ADDV	$32, R4
	SUBV	$32, R7
	BEQ	R7, R0, end

xor_16_check:
	SRLV	$1, R9		// R9 = 16
	BLT	R7, R9, xor_8_check
xor_16:
	MOVV	(R5), R10
	MOVV	8(R5), R11
	MOVV	(R6), R12
	MOVV	8(R6), R13
	XOR	R10, R12
	XOR	R11, R13
	MOVV	R12, (R4)
	MOVV	R13, 8(R4)
	ADDV	$16, R5
	ADDV	$16, R6
	ADDV	$16, R4
	SUBV	$16, R7
	BEQ	R7, R0, end

xor_8_check:
	SRLV	$1, R9		// R9 = 8
	BLT	R7, R9, xor_4_check
xor_8:
	MOVV	(R5), R10
	MOVV	(R6), R11
	XOR	R10, R11
	MOVV	R11, (R4)
	ADDV	$8, R5
	ADDV	$8, R6
	ADDV	$8, R4
	SUBV	$8, R7
	BEQ	R7, R0, end

xor_4_check:
	SRLV	$1, R9		// R9 = 4
	BLT	R7, R9, xor_2_check
xor_4:
	MOVW	(R5), R10
	MOVW	(R6), R11
	XOR	R10, R11
	MOVW	R11, (R4)
	ADDV	$4, R5
	ADDV	$4, R6
	ADDV	$4, R4
	SUBV	$4, R7
	BEQ	R7, R0, end

xor_2_check:
	SRLV	$1, R9		// R9 = 2
	BLT	R7, R9, xor_1	// remaining is non-zero here, so it must be 1
xor_2:
	MOVH	(R5), R10
	MOVH	(R6), R11
	XOR	R10, R11
	MOVH	R11, (R4)
	ADDV	$2, R5
	ADDV	$2, R6
	ADDV	$2, R4
	SUBV	$2, R7
	BEQ	R7, R0, end

xor_1:
	MOVB	(R5), R10
	MOVB	(R6), R11
	XOR	R10, R11
	MOVB	R11, (R4)

end:
	RET

0 commit comments

Comments
 (0)