-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsu3_rdot.m4
65 lines (61 loc) · 1.62 KB
/
su3_rdot.m4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// float su3_rdot( su3_vector *a, su3_vector *b );
// Returns the real part of the dot product of A and B.
// File su3_rdot.m4: i860 assembler version of su3_rdot.c
//
// Symbolic register names, substituted by m4 before the assembler runs.
// Two integer registers carry the vector addresses; twelve floating point
// registers (f8-f19) receive the three complex elements of each vector.
// Every complex element lives in an even/odd register pair: the even
// register is the real part, the odd one the imaginary part. The pair name
// (a0, b0, ...) is the same register as its real part and is what the
// 64-bit loads in _su3_rdot use as destination.
// Note that the return register shares f8 with the first real part, so the
// result overwrites that input once it is no longer needed.
// NOTE(review): m4 also substitutes these names inside // comments; that is
// harmless but explains register names showing up in preprocessed comments.
define(A,r16) // address of source 1
define(B,r17) // address of source 2
define(sum,f8) // return value
// Source 1, element 0
define(a0,f8) // complex number = register pair
define(a0r,f8) // real part
define(a0i,f9) // imag part
// Source 1, element 1 (pair f10/f11)
define(a1,f10)
define(a1r,f10)
define(a1i,f11)
// Source 1, element 2 (pair f12/f13)
define(a2,f12)
define(a2r,f12)
define(a2i,f13)
// Source 2, element 0
define(b0,f14) // complex number = register pair
define(b0r,f14) // real part
define(b0i,f15) // imag part
// Source 2, element 1 (pair f16/f17)
define(b1,f16)
define(b1r,f16)
define(b1i,f17)
// Source 2, element 2 (pair f18/f19)
define(b2,f18)
define(b2r,f18)
define(b2i,f19)
// ---------------------------------------------------------------------
// _su3_rdot
// Computes the six products real*real and imag*imag of the three complex
// elements of the two input vectors and adds them all together, which is
// the real part of the complex dot product.
// In:   A, B = addresses of the two vectors, each read as three
//       consecutive 8-byte (real, imag) pairs at offsets 0, 8 and 16.
// Out:  the total is delivered into the return register named sum.
// Uses: f8-f19 for the loaded operands, plus the multiplier and adder
//       pipelines and the T register of the floating point unit.
// The code is software pipelined by hand: instruction order and count
// are significant, because every pipelined instruction both starts a new
// operation and collects the result of an earlier one.
// NOTE(review): f0 is used throughout as throwaway operand and destination;
// this relies on f0 reading as zero and ignoring writes on the i860 -
// confirm against the architecture manual.
// ---------------------------------------------------------------------
.text
.align 8
_su3_rdot:
// Load both vectors. Each fld.d fills one real/imag register pair.
fld.d 0(A),a0
fld.d 8(A),a1
fld.d 16(A),a2
fld.d 0(B),b0
fld.d 8(B),b1
fld.d 16(B),b2
// Fill the multiplier pipeline with the first three products. Nothing
// useful leaves the pipeline yet, so the destination is f0.
pfmul.ss a0r,b0r,f0
pfmul.ss a0i,b0i,f0
pfmul.ss a1r,b1r,f0
// Dual-operation instructions from here on: each one feeds the multiplier
// with the next product while the product now leaving the multiplier is
// either latched into T or sent to the adder together with T.
mm12ttpm.ss a1i,b1i,f0 // a0r*b0r into T reg.
m12tpm.ss a2r,b2r,f0 // start add of a0r*b0r and a0i*b0i
mm12ttpm.ss a2i,b2i,f0 // a1r*b1r into T reg.
// All six products have been issued; the multiplier inputs below are f0
// and only serve to push the remaining products out of the pipeline.
m12tpm.ss f0,f0,f0 // start add of a1r*b1r and a1i*b1i
// adder output is now OK
// (values leaving the adder from here on are real partial results, with
// a few don't-care values in between that are discarded into f0)
m12ttpa.ss f0,f0,a0r // a0r*b0r + a0i*b0i comes out
m12tpm.ss f0,f0,f0 // start add of a2r*b2r and a2i*b2i
i2ap1.ss a0r,f0,f0 // start (a0r*b0r+a0i*b0i) + (a1r*b1r+a1i*b1i)
// Two plain adder advances so the element-2 pair total reaches the output.
pfadd.ss f0,f0,f0
pfadd.ss f0,f0,a0r // a0r gets (a2r*b2r+a2i*b2i)
// Single-precision tail (the active variant). The conditional that once
// selected between the two tails is commented out.
//.if FLOATOPTION=X167
// Start the last addition: element-2 pair total plus the total of the
// first two elements, which is leaving the adder at this moment.
i2ap1.ss a0r,f0,f0
// Drain the adder so the final value arrives at the output.
pfadd.ss f0,f0,f0
pfadd.ss f0,f0,f0
// Return through r1.
// NOTE(review): bri is presumably a delayed branch, so the pfadd after it
// still executes and writes the final value into the return register -
// confirm against the i860 manual.
bri r1
pfadd.ss f0,f0,sum
// Disabled alternative tail: identical sequence with the .sd suffix
// (presumably single-precision inputs producing a double-precision result).
//.else
//i2ap1.sd a0r,f0,f0
//pfadd.sd f0,f0,f0
//pfadd.sd f0,f0,f0
//bri r1
//pfadd.sd f0,f0,sum
//.endif
// Export the entry point to the linker.
.globl _su3_rdot