Skip to content

Commit 6f08aa2

Browse files
committed
Remove saving_func_return (which is just `bx lr`) and fold the separate pop of lr into a pop of pc
1 parent 5466d9d commit 6f08aa2

1 file changed

Lines changed: 25 additions & 77 deletions

File tree

src/rp2_common/pico_double/double_fma_dcp.S

Lines changed: 25 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -31,10 +31,6 @@ double_section WRAPPER_FUNC_NAME(\func)
3131

3232
// ============== STATE SAVE AND RESTORE ===============
3333

34-
.macro saving_func_return
35-
bx lr
36-
.endm
37-
3834
double_section __rp2350_dcp_engaged_state_save_restore_copy
3935
.thumb_func
4036
__dcp_save_state:
@@ -156,9 +152,7 @@ fma_entry:
156152
WXMO r2,r3 @ write sticky+result bits
157153
NRDD @ as dmul macro tail: exponent computed in coprocessor is correct
158154
RDDM r0,r1
159-
// todo optimize this based on final decision on saving_func_entry
160-
pop {r4-r8,lr}
161-
saving_func_return
155+
pop {r4-r8,pc}
162156

163157
93:
164158
adds r1,r1,r4
@@ -168,9 +162,7 @@ fma_entry:
168162
WXMO r2,r3 @ write sticky+result bits
169163
NRDD
170164
RDDM r0,r1
171-
// todo optimize this based on final decision on saving_func_entry
172-
pop {r4-r8,lr}
173-
saving_func_return
165+
pop {r4-r8,pc}
174166

175167
92:
176168
adds r2,r2,r4
@@ -179,9 +171,7 @@ fma_entry:
179171
WXMO r2,r3 @ write sticky+result bits
180172
NRDD
181173
RDDM r0,r1
182-
// todo optimize this based on final decision on saving_func_entry
183-
pop {r4-r8,lr}
184-
saving_func_return
174+
pop {r4-r8,pc}
185175

186176
91: @ case where a (Q52) is shifted down relative to mn (Q124); the mod 32 part of the shift of a has already been done
187177
@ r0:r1:r2:r3: mn
@@ -197,9 +187,7 @@ fma_entry:
197187
WXMO r2,r3 @ write sticky+result bits
198188
NRDD
199189
RDDM r0,r1
200-
// todo optimize this based on final decision on saving_func_entry
201-
pop {r4-r8,lr}
202-
saving_func_return
190+
pop {r4-r8,pc}
203191

204192
94:
205193
adds r0,r0,r5 @ one word shift down
@@ -211,9 +199,7 @@ fma_entry:
211199
WXMO r2,r3 @ write sticky+result bits
212200
NRDD
213201
RDDM r0,r1
214-
// todo optimize this based on final decision on saving_func_entry
215-
pop {r4-r8,lr}
216-
saving_func_return
202+
pop {r4-r8,pc}
217203

218204
95:
219205
adds r0,r0,r6 @ two word shift down
@@ -226,9 +212,7 @@ fma_entry:
226212
WXMO r2,r3 @ write sticky+result bits
227213
NRDD
228214
RDDM r0,r1
229-
// todo optimize this based on final decision on saving_func_entry
230-
pop {r4-r8,lr}
231-
saving_func_return
215+
pop {r4-r8,pc}
232216

233217
@ ======================== ADDITION PATH, RESULT HAS COMPARABLE MAGNITUDE TO a ========================
234218

@@ -267,9 +251,7 @@ fma_entry:
267251
add r3,r3,r14,lsl#20 @ note that "implied" 1 is present in r3, giving an offset of 1 in the exponent
268252
bmi 1f @ negative? then we have just constructed a denormal (or less) and the addition will give an incorrect result
269253
dcp_dadd_m r0,r1,r2,r3,r4,r5
270-
// todo optimize this based on final decision on saving_func_entry
271-
pop {r4-r8,lr}
272-
saving_func_return
254+
pop {r4-r8,pc}
273255

274256
1:
275257
@ compare with similar code in subtraction path: here we cannot underflow
@@ -279,9 +261,7 @@ fma_entry:
279261
add r5,r5,#0x40000000
280262
dcp_dadd_m r0,r1,r2,r3,r4,r5
281263
sub r1,r1,#0x40000000 @ fix exponent
282-
// todo optimize this based on final decision on saving_func_entry
283-
pop {r4-r8,lr}
284-
saving_func_return
264+
pop {r4-r8,pc}
285265

286266
90:
287267
@ dcp_dmul_m tail then dadd ("mla path")
@@ -290,17 +270,13 @@ fma_entry:
290270
NRDD
291271
RDDM r0,r1
292272
dcp_dadd_m r0,r1,r0,r1,r4,r5
293-
// todo optimize this based on final decision on saving_func_entry
294-
pop {r4-r8,lr}
295-
saving_func_return
273+
pop {r4-r8,pc}
296274

297275
82: @ |mn| is very small compared to |a|, so result is a
298276
RDDM r0,r1 @ clear the engaged flag
299277
movs r0,r4
300278
movs r1,r5
301-
// todo optimize this based on final decision on saving_func_entry
302-
pop {r4-r8,lr}
303-
saving_func_return
279+
pop {r4-r8,pc}
304280

305281
@ ======================== SUBTRACTION PATH ========================
306282

@@ -342,9 +318,7 @@ fma_entry:
342318
WXMO r2,r3 @ write sticky+result bits
343319
NRDD @ as dmul macro tail: exponent and sign computed in coprocessor is correct
344320
RDDM r0,r1
345-
// todo optimize this based on final decision on saving_func_entry
346-
pop {r4-r8,lr}
347-
saving_func_return
321+
pop {r4-r8,pc}
348322

349323
94:
350324
@ here if ea-(em+en)=3 e.g. if ea=0 then em+en=-3 and 1/8=2^-3≤mn<4.2^-3=1/2
@@ -364,9 +338,7 @@ fma_entry:
364338
NRDD
365339
RDDM r0,r1
366340
eor r1,r1,0x80000000 @ sign of result is opposite to that of product as yielded by coprocessor
367-
// todo optimize this based on final decision on saving_func_entry
368-
pop {r4-r8,lr}
369-
saving_func_return
341+
pop {r4-r8,pc}
370342

371343
93:
372344
subs r1,r1,r4 @ shifting a up by one word: this cannot go negative or have bad cancellation
@@ -376,9 +348,7 @@ fma_entry:
376348
WXMO r2,r3 @ write sticky+result bits
377349
NRDD
378350
RDDM r0,r1
379-
// todo optimize this based on final decision on saving_func_entry
380-
pop {r4-r8,lr}
381-
saving_func_return
351+
pop {r4-r8,pc}
382352

383353
92:
384354
subs r2,r2,r4 @ shifting a up by two words: this /can/ go negative or have bad cancellation
@@ -389,9 +359,7 @@ fma_entry:
389359
WXMO r2,r3 @ write sticky+result bits
390360
NRDD
391361
RDDM r0,r1
392-
// todo optimize this based on final decision on saving_func_entry
393-
pop {r4-r8,lr}
394-
saving_func_return
362+
pop {r4-r8,pc}
395363

396364
@ heavy cancellation case
397365
@ r0:r1:r2:r3: result Q124, signed
@@ -450,9 +418,7 @@ fma_entry:
450418
ble 12f @ underflow? overflow cannot occur here as the result is smaller in magnitude than a
451419
bfi r1,r8,#20,#11 @ insert exponent
452420
orr r1,r1,r14,lsl#31 @ or in sign
453-
// todo optimize this based on final decision on saving_func_entry
454-
pop {r4-r8,lr}
455-
saving_func_return
421+
pop {r4-r8,pc}
456422

457423
11:
458424
adcs r0,r0,#0 @ round up as above
@@ -462,26 +428,20 @@ fma_entry:
462428
ble 12f @ underflow?
463429
bfi r1,r8,#20,#11 @ insert exponent
464430
orr r1,r1,r14,lsl#31 @ or in sign
465-
// todo optimize this based on final decision on saving_func_entry
466-
pop {r4-r8,lr}
467-
saving_func_return
431+
pop {r4-r8,pc}
468432

469433
10:
470434
adds r8,r8,r1,lsr#20 @ candidate for exponent field
471435
ble 12f @ underflow?
472436
bfi r1,r8,#20,#11 @ insert exponent
473437
orr r1,r1,r14,lsl#31 @ or in sign
474438
9:
475-
// todo optimize this based on final decision on saving_func_entry
476-
pop {r4-r8,lr}
477-
saving_func_return
439+
pop {r4-r8,pc}
478440

479441
12:
480442
mov r1,r14,lsl#31 @ underflow: return signed zero
481443
movs r0,#0
482-
// todo optimize this based on final decision on saving_func_entry
483-
pop {r4-r8,lr}
484-
saving_func_return
444+
pop {r4-r8,pc}
485445

486446
91: @ case where a (Q52) is shifted down relative to mn (Q124); the mod 32 part of the shift of a has already been done
487447
@ r0:r1:r2:r3: mn
@@ -501,9 +461,7 @@ fma_entry:
501461
WXMO r2,r3 @ write sticky+result bits
502462
NRDD
503463
RDDM r0,r1
504-
// todo optimize this based on final decision on saving_func_entry
505-
pop {r4-r8,lr}
506-
saving_func_return
464+
pop {r4-r8,pc}
507465

508466
94:
509467
rsbs r4,r4,#0 @ one word shift down
@@ -516,9 +474,7 @@ fma_entry:
516474
WXMO r2,r3 @ write sticky+result bits
517475
NRDD
518476
RDDM r0,r1
519-
// todo optimize this based on final decision on saving_func_entry
520-
pop {r4-r8,lr}
521-
saving_func_return
477+
pop {r4-r8,pc}
522478

523479
95:
524480
movs r7,#0 @ two words shift down
@@ -534,9 +490,7 @@ fma_entry:
534490
WXMO r2,r3 @ write sticky+result bits
535491
NRDD
536492
RDDM r0,r1
537-
// todo optimize this based on final decision on saving_func_entry
538-
pop {r4-r8,lr}
539-
saving_func_return
493+
pop {r4-r8,pc}
540494

541495
80:
542496
@ here |a| is big compared to |mn|, more precisely ea-(em+en)≥4 so e.g. if ea=0 then em+en≤-4 and mn<4.2^-4=1/4
@@ -572,9 +526,7 @@ fma_entry:
572526
bmi 1f @ did exponent go negative?
573527

574528
dcp_dadd_m r0,r1,r2,r3,r4,r5
575-
// todo optimize this based on final decision on saving_func_entry
576-
pop {r4-r8,lr}
577-
saving_func_return
529+
pop {r4-r8,pc}
578530

579531
1:
580532
cmn r7,#64 @ is mn being shifted well below the bottom of a?
@@ -588,9 +540,7 @@ fma_entry:
588540
andls r1,r1,0x80000000 @ flush to signed zero
589541
movls r0,#0
590542
subhi r1,r1,#0x40000000 @ else fix exponent of result
591-
// todo optimize this based on final decision on saving_func_entry
592-
pop {r4-r8,lr}
593-
saving_func_return
543+
pop {r4-r8,pc}
594544

595545

596546
double_section fma_fast
@@ -615,8 +565,7 @@ regular_func mla
615565
dcp_dmul_m r0,r1,r0,r1,r2,r3,r0,r1,r2,r3,r4,r5,r14
616566
ldrd r2,r3,[r12,#0] @ fetch a using original SP
617567
dcp_dadd_m r0,r1,r0,r1,r2,r3
618-
pop {r4,r5,r14}
619-
saving_func_return
568+
pop {r4,r5,pc}
620569
#else
621570
// __ARM_PCS_VFP version
622571
@ d0 m
@@ -632,8 +581,7 @@ regular_func mla
632581
vmov r2,r3,d2
633582
dcp_dadd_m r0,r1,r0,r1,r2,r3
634583
vmov d0,r0,r1
635-
pop {r4, r5, r14}
636-
saving_func_return
584+
pop {r4, r5, pc}
637585
#endif
638586

639587
#endif

0 commit comments

Comments
 (0)