@@ -31,10 +31,6 @@ double_section WRAPPER_FUNC_NAME(\func)
3131
3232// ============== STATE SAVE AND RESTORE ===============
3333
34- .macro saving_func_return
35- bx lr
36- .endm
37-
3834double_section __rp2350_dcp_engaged_state_save_restore_copy
3935.thumb_func
4036__dcp_save_state:
@@ -156,9 +152,7 @@ fma_entry:
156152 WXMO r2 , r3 @ write sticky + result bits
157153 NRDD @ as dmul macro tail: exponent computed in coprocessor is correct
158154 RDDM r0 , r1
159- // todo optimize this based on final decision on saving_func_entry
160- pop {r4 - r8 , lr}
161- saving_func_return
155+ pop {r4 - r8 , pc}
162156
16315793 :
164158 adds r1 , r1 , r4
@@ -168,9 +162,7 @@ fma_entry:
168162 WXMO r2 , r3 @ write sticky + result bits
169163 NRDD
170164 RDDM r0 , r1
171- // todo optimize this based on final decision on saving_func_entry
172- pop {r4 - r8 , lr}
173- saving_func_return
165+ pop {r4 - r8 , pc}
174166
17516792 :
176168 adds r2 , r2 , r4
@@ -179,9 +171,7 @@ fma_entry:
179171 WXMO r2 , r3 @ write sticky + result bits
180172 NRDD
181173 RDDM r0 , r1
182- // todo optimize this based on final decision on saving_func_entry
183- pop {r4 - r8 , lr}
184- saving_func_return
174+ pop {r4 - r8 , pc}
185175
18617691 : @ case where a (Q52) is shifted down relative to mn (Q124) ; the mod 32 part of the shift of a has already been done
187177@ r0:r1:r2:r3: mn
@@ -197,9 +187,7 @@ fma_entry:
197187 WXMO r2 , r3 @ write sticky + result bits
198188 NRDD
199189 RDDM r0 , r1
200- // todo optimize this based on final decision on saving_func_entry
201- pop {r4 - r8 , lr}
202- saving_func_return
190+ pop {r4 - r8 , pc}
203191
20419294 :
205193 adds r0 , r0 , r5 @ one word shift down
@@ -211,9 +199,7 @@ fma_entry:
211199 WXMO r2 , r3 @ write sticky + result bits
212200 NRDD
213201 RDDM r0 , r1
214- // todo optimize this based on final decision on saving_func_entry
215- pop {r4 - r8 , lr}
216- saving_func_return
202+ pop {r4 - r8 , pc}
217203
21820495 :
219205 adds r0 , r0 , r6 @ two word shift down
@@ -226,9 +212,7 @@ fma_entry:
226212 WXMO r2 , r3 @ write sticky + result bits
227213 NRDD
228214 RDDM r0 , r1
229- // todo optimize this based on final decision on saving_func_entry
230- pop {r4 - r8 , lr}
231- saving_func_return
215+ pop {r4 - r8 , pc}
232216
233217@ ======================== ADDITION PATH , RESULT HAS COMPARABLE MAGNITUDE TO a ========================
234218
@@ -267,9 +251,7 @@ fma_entry:
267251 add r3 , r3 , r14 , lsl # 20 @ note that "implied" 1 is present in r3 , giving an offset of 1 in the exponent
268252 bmi 1f @ negative? then we have just constructed a denormal ( or less) and the addition will give an incorrect result
269253 dcp_dadd_m r0 , r1 , r2 , r3 , r4 , r5
270- // todo optimize this based on final decision on saving_func_entry
271- pop {r4 - r8 , lr}
272- saving_func_return
254+ pop {r4 - r8 , pc}
273255
2742561 :
275257@ compare with similar code in subtraction path: here we cannot underflow
@@ -279,9 +261,7 @@ fma_entry:
279261 add r5 , r5 , # 0x40000000
280262 dcp_dadd_m r0 , r1 , r2 , r3 , r4 , r5
281263 sub r1 , r1 , # 0x40000000 @ fix exponent
282- // todo optimize this based on final decision on saving_func_entry
283- pop {r4 - r8 , lr}
284- saving_func_return
264+ pop {r4 - r8 , pc}
285265
28626690 :
287267@ dcp_dmul_m tail then dadd ( "mla path" )
@@ -290,17 +270,13 @@ fma_entry:
290270 NRDD
291271 RDDM r0 , r1
292272 dcp_dadd_m r0 , r1 , r0 , r1 , r4 , r5
293- // todo optimize this based on final decision on saving_func_entry
294- pop {r4 - r8 , lr}
295- saving_func_return
273+ pop {r4 - r8 , pc}
296274
29727582 : @ |mn| is very small compared to |a| , so result is a
298276 RDDM r0 , r1 @ clear the engaged flag
299277 movs r0 , r4
300278 movs r1 , r5
301- // todo optimize this based on final decision on saving_func_entry
302- pop {r4 - r8 , lr}
303- saving_func_return
279+ pop {r4 - r8 , pc}
304280
305281@ ======================== SUBTRACTION PATH ========================
306282
@@ -342,9 +318,7 @@ fma_entry:
342318 WXMO r2 , r3 @ write sticky + result bits
343319 NRDD @ as dmul macro tail: exponent and sign computed in coprocessor is correct
344320 RDDM r0 , r1
345- // todo optimize this based on final decision on saving_func_entry
346- pop {r4 - r8 , lr}
347- saving_func_return
321+ pop {r4 - r8 , pc}
348322
34932394 :
350324@ here if ea-(em+en) = 3 , e.g. if ea = 0 then em+en = -3 and 1/8 = 2^-3 ≤ mn < 4*2^-3 = 1/2
@@ -364,9 +338,7 @@ fma_entry:
364338 NRDD
365339 RDDM r0 , r1
366340 eor r1 , r1 , 0x80000000 @ sign of result is opposite to that of product as yielded by coprocessor
367- // todo optimize this based on final decision on saving_func_entry
368- pop {r4 - r8 , lr}
369- saving_func_return
341+ pop {r4 - r8 , pc}
370342
37134393 :
372344 subs r1 , r1 , r4 @ shifting a up by one word: this cannot go negative or have bad cancellation
@@ -376,9 +348,7 @@ fma_entry:
376348 WXMO r2 , r3 @ write sticky + result bits
377349 NRDD
378350 RDDM r0 , r1
379- // todo optimize this based on final decision on saving_func_entry
380- pop {r4 - r8 , lr}
381- saving_func_return
351+ pop {r4 - r8 , pc}
382352
38335392 :
384354 subs r2 , r2 , r4 @ shifting a up by two words: this /can/ go negative or have bad cancellation
@@ -389,9 +359,7 @@ fma_entry:
389359 WXMO r2 , r3 @ write sticky + result bits
390360 NRDD
391361 RDDM r0 , r1
392- // todo optimize this based on final decision on saving_func_entry
393- pop {r4 - r8 , lr}
394- saving_func_return
362+ pop {r4 - r8 , pc}
395363
396364@ heavy cancellation case
397365@ r0:r1:r2:r3: result Q124 , signed
@@ -450,9 +418,7 @@ fma_entry:
450418 ble 12f @ underflow? overflow cannot occur here as the result is smaller in magnitude than a
451419 bfi r1 , r8 , # 20 , # 11 @ insert exponent
452420 orr r1 , r1 , r14 , lsl # 31 @ or in sign
453- // todo optimize this based on final decision on saving_func_entry
454- pop {r4 - r8 , lr}
455- saving_func_return
421+ pop {r4 - r8 , pc}
456422
45742311 :
458424 adcs r0 , r0 , # 0 @ round up as above
@@ -462,26 +428,20 @@ fma_entry:
462428 ble 12f @ underflow?
463429 bfi r1 , r8 , # 20 , # 11 @ insert exponent
464430 orr r1 , r1 , r14 , lsl # 31 @ or in sign
465- // todo optimize this based on final decision on saving_func_entry
466- pop {r4 - r8 , lr}
467- saving_func_return
431+ pop {r4 - r8 , pc}
468432
46943310 :
470434 adds r8 , r8 , r1 , lsr# 20 @ candidate for exponent field
471435 ble 12f @ underflow?
472436 bfi r1 , r8 , # 20 , # 11 @ insert exponent
473437 orr r1 , r1 , r14 , lsl # 31 @ or in sign
4744389 :
475- // todo optimize this based on final decision on saving_func_entry
476- pop {r4 - r8 , lr}
477- saving_func_return
439+ pop {r4 - r8 , pc}
478440
47944112 :
480442 mov r1 , r14 , lsl # 31 @ underflow: return signed zero
481443 movs r0 , # 0
482- // todo optimize this based on final decision on saving_func_entry
483- pop {r4 - r8 , lr}
484- saving_func_return
444+ pop {r4 - r8 , pc}
485445
48644691 : @ case where a (Q52) is shifted down relative to mn (Q124) ; the mod 32 part of the shift of a has already been done
487447@ r0:r1:r2:r3: mn
@@ -501,9 +461,7 @@ fma_entry:
501461 WXMO r2 , r3 @ write sticky + result bits
502462 NRDD
503463 RDDM r0 , r1
504- // todo optimize this based on final decision on saving_func_entry
505- pop {r4 - r8 , lr}
506- saving_func_return
464+ pop {r4 - r8 , pc}
507465
50846694 :
509467 rsbs r4 , r4 , # 0 @ one word shift down
@@ -516,9 +474,7 @@ fma_entry:
516474 WXMO r2 , r3 @ write sticky + result bits
517475 NRDD
518476 RDDM r0 , r1
519- // todo optimize this based on final decision on saving_func_entry
520- pop {r4 - r8 , lr}
521- saving_func_return
477+ pop {r4 - r8 , pc}
522478
52347995 :
524480 movs r7 , # 0 @ two words shift down
@@ -534,9 +490,7 @@ fma_entry:
534490 WXMO r2 , r3 @ write sticky + result bits
535491 NRDD
536492 RDDM r0 , r1
537- // todo optimize this based on final decision on saving_func_entry
538- pop {r4 - r8 , lr}
539- saving_func_return
493+ pop {r4 - r8 , pc}
540494
54149580 :
542496@ here |a| is big compared to |mn| , more precisely ea-(em+en) ≥ 4 , so e.g. if ea = 0 then em+en ≤ -4 and mn < 4*2^-4 = 1/4
@@ -572,9 +526,7 @@ fma_entry:
572526 bmi 1f @ did exponent go negative?
573527
574528 dcp_dadd_m r0 , r1 , r2 , r3 , r4 , r5
575- // todo optimize this based on final decision on saving_func_entry
576- pop {r4 - r8 , lr}
577- saving_func_return
529+ pop {r4 - r8 , pc}
578530
5795311 :
580532 cmn r7 , # 64 @ is mn being shifted well below the bottom of a?
@@ -588,9 +540,7 @@ fma_entry:
588540 andls r1 , r1 , 0x80000000 @ flush to signed zero
589541 movls r0 , # 0
590542 subhi r1 , r1 , # 0x40000000 @ else fix exponent of result
591- // todo optimize this based on final decision on saving_func_entry
592- pop {r4 - r8 , lr}
593- saving_func_return
543+ pop {r4 - r8 , pc}
594544
595545
596546double_section fma_fast
@@ -615,8 +565,7 @@ regular_func mla
615565 dcp_dmul_m r0 , r1 , r0 , r1 , r2 , r3 , r0 , r1 , r2 , r3 , r4 , r5 , r14
616566 ldrd r2 , r3 ,[ r12 , # 0 ] @ fetch a using original SP
617567 dcp_dadd_m r0 , r1 , r0 , r1 , r2 , r3
618- pop {r4 , r5 , r14 }
619- saving_func_return
568+ pop {r4 , r5 , pc}
620569#else
621570// __ARM_PCS_VFP version
622571@ d0 m
@@ -632,8 +581,7 @@ regular_func mla
632581 vmov r2 , r3 , d2
633582 dcp_dadd_m r0 , r1 , r0 , r1 , r2 , r3
634583 vmov d0 , r0 , r1
635- pop {r4 , r5 , r14 }
636- saving_func_return
584+ pop {r4 , r5 , pc}
637585#endif
638586
639587#endif
0 commit comments