/*
 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

/*
 * Double-precision fused multiply-add (fma) and non-fused mla for RP2350,
 * implemented on the Double Coprocessor (DCP).
 *
 * fma(m, n, a) computes m*n+a with a single rounding: the full 128-bit
 * product mn is formed in software (Q124, 1<=mn<4) and a is aligned and
 * added/subtracted into it before a single round via the DCP's
 * WXMS/WXMO/NRDD/RDDM tail (same tail as the canned dmul macro).
 * mla/fma_fast is the cheaper two-rounding version (dmul then dadd).
 *
 * NOTE(review): the DCP opcode macros (PXMD, WXUP, PEFD, WXMS, NRDD, RDDM,
 * dcp_dadd_m, dcp_dmul_m, ...) come from hardware/dcp_instr.inc.S /
 * dcp_canned.inc.S; their exact semantics are assumed from usage here —
 * confirm against the RP2350 datasheet DCP chapter.
 */

#include "pico/asm_helper.S"

#if !HAS_DOUBLE_COPROCESSOR
#error attempt to compile double_fma_rp2350 when there is no DCP
#else

#include "hardware/dcp_instr.inc.S"
#include "hardware/dcp_canned.inc.S"

pico_default_asm_setup

// factor out save/restore (there is a copy in float code)

// Place code in RAM or flash section named for \name, per PICO_DOUBLE_IN_RAM.
.macro double_section name
#if PICO_DOUBLE_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

.macro double_wrapper_section func
double_section WRAPPER_FUNC_NAME(\func)
.endm

// ============== STATE SAVE AND RESTORE ===============

.macro saving_func_return
    bx lr
.endm

double_section __rp2350_dcp_engaged_state_save_restore_copy

// Save the live DCP state (X, Y, EF register pairs) in a 24-byte stack frame,
// call back into the wrapped function (via blx lr), then fall through into
// __dcp_restore_state when that function returns.  Used when the "engaged"
// flag shows an interrupted computation already owns the DCP.
.thumb_func
__dcp_save_state:
    sub sp, #24
    push {r0, r1}
    // do save here
    PXMD r0, r1
    strd r0, r1, [sp, #8 + 0]
    PYMD r0, r1
    strd r0, r1, [sp, #8 + 8]
    REFD r0, r1
    strd r0, r1, [sp, #8 + 16]
    pop {r0, r1}
    blx lr
    // <- wrapped function returns here
    // fall through into restore:
.thumb_func
__dcp_restore_state:
    // do restore here; r12/r14 are free scratch at this point
    pop {r12, r14}
    WXMD r12, r14
    pop {r12, r14}
    WYMD r12, r14
    pop {r12, r14}
    WEFD r12, r14
    pop {pc}

double_wrapper_section __dfma

@ cf saving_func macro: but here we need to record the SP before the state save possibly changes it
1:
    push {lr}                           // 16-bit instruction
    bl __dcp_save_state                 // 32-bit instruction
    b 1f                                // 16-bit instruction

@ compute mn+a with full intermediate precision
@ r0:r1 m
@ r2:r3 n
@ [r13,#0] a
wrapper_func fma
    mov r12,sp                          @ save the SP (a is fetched via r12 even if save-state adjusts sp)
    PCMP apsr_nzcv                      @ test the engaged flag
    bmi 1b                              @ DCP busy: save its state first (trampoline above)
1:
    push {r4-r8,r14}
    ldrd r4,r5,[r12,#0]                 @ fetch a using original SP
    ubfx r7,r1,#20,#11                  @ r7=em
    ubfx r8,r3,#20,#11                  @ r8=en
    add r8,r7,r8                        @ em+en
    eors r6,r1,r3                       @ get sign of mn
    eors r6,r6,r5                       @ set N if mn has opposite sign to a, i.e. if the operation is essentially a subtraction
                                        @ (N flag stays live until the bmi 50f below; the DCP ops in between must not clobber flags)
    WXUP r4,r5                          @ write a to coprocessor to get its classification
    PEFD r14,r12                        @ r14=fa
    WXUP r0,r1                          @ write m and n to coprocessor to get their classifications
    WYUP r2,r3
    PEFD r6,r12                         @ r6=fm, r12=fn, r14=fa
    orr r14,r14,r6
    orr r14,r14,r12                     @ OR of all the classification flags, so we can check if any are zero/Inf/NaN
    RXMS r3,r6,0                        @ we will almost always need the full product so compute it here (cf dmul macro)
    RYMS r7,r12,0
    umull r0,r1,r3,r7                   @ 64x64->128 schoolbook multiply of the two mantissas
    mov r2,#0                           @ seems to be no 16-bit instruction which zeros a register without affecting the flags
    umlal r1,r2,r3,r12
    umlal r1,r2,r6,r7
    mov r3,#0
    umlal r2,r3,r6,r12                  @ r0:r1:r2:r3: full product mn Q124 1≤mn<4
    bmi 50f                             @ mn has opposite sign to a so operation is essentially a subtraction

@ ======================== ADDITION PATH ========================
    tst r14,#0x70000000                 @ were any of the arguments zero/inf/NaN?
    bne 90f                             @ then use mla path which gives the correct result in all these cases
    ubfx r14,r5,#20,#11                 @ r14=ea
@ here all operands are finite and non-zero
@ r0:r1:r2:r3: full product mn Q124 1≤mn<4
@ r4:r5 a IEEE packed
@ r8: em+en [biased +0x3ff*2]
@ r14: ea [biased +0x3ff]
    subw r7,r8,#0x3fd
    subs r7,r7,r14                      @ em+en-ea+2 (debiased)
    blt 80f                             @ branch if |a| is big compared to |mn|, more precisely if ea-(em+en)≥3 so e.g. if ea=0 (hence 1≤a<2) then em+en≤-3 and mn<4.2¯³=1/2

@ ======================== ADDITION PATH, RESULT HAS COMPARABLE MAGNITUDE TO mn ========================
@ here |mn| is big compared to |a|; e.g. if em+en=0 (so 1≤mn<4) then ea≤2 and a<8
    movs r8,#1
    bfi r5,r8,#20,#12                   @ insert implied 1 in a
    rsbs r7,r7,#74                      @ shift up ≤74 (can be negative) that will be required for a (Q52) to align with mn (Q124, ending in 20 zeros)
@ now add (shifted) a into mn, preserving flags
    and r8,r7,#0x1f                     @ k=shift mod 32
    mov r12,#1
    lsl r12,r12,r8                      @ 2^k
    umull r5,r6,r5,r12                  @ shift up high word: r4:r5:r6 is now a_lo + 2^k a_hi
    sub r12,#1                          @ 2^k-1
    umlal r4,r5,r4,r12                  @ shift up low word, adding in: r4:r5:r6 is now (a_lo + 2^k a_hi) + (2^k-1) a_lo = 2^k (a_lo + a_hi) = a shifted up by k
    bmi 91f                             @ use flags (still from rsbs above): will a be shifted down?
    cmp r7,#64                          @ shift up by two more words?
    bge 92f
    cmp r7,#32                          @ shift up by one more word?
    bge 93f
    adds r0,r0,r4                       @ no more word shifts
    adcs r1,r1,r5
    adcs r2,r2,r6
    adcs r3,r3,#0                       @ r0:r1:r2:r3: mn + a (cf dmul macro)
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD                                @ as dmul macro tail: exponent computed in coprocessor is correct
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
93:
    adds r1,r1,r4
    adcs r2,r2,r5
    adcs r3,r3,r6                       @ r0:r1:r2:r3: mn + (a<<32)
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
92:
    adds r2,r2,r4
    adcs r3,r3,r5                       @ r0:r1:r2:r3: mn + (a<<64); note this cannot overflow as total shift was at most 74 (see above)
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
91:
@ case where a (Q52) is shifted down relative to mn (Q124); the mod 32 part of the shift of a has already been done
@ r0:r1:r2:r3: mn
@ r4:r5:r6: a
@ r7: alignment shift required (negative)
    cmn r7,#32                          @ shift down one word?
    bge 94f
    cmn r7,#64                          @ shift down two words?
    bge 95f
@ here a is shifted entirely below the bottom of m
    orr r0,r0,#1                        @ a is non-zero so ensure we set the sticky bit
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
94:
    adds r0,r0,r5                       @ one word shift down
    adcs r1,r1,r6
    adcs r2,r2,#0
    adcs r3,r3,#0
    orr r0,r0,r4                        @ contribution from a to sticky bits
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
95:
    adds r0,r0,r6                       @ two word shift down
    adcs r1,r1,#0
    adcs r2,r2,#0
    adcs r3,r3,#0
    orr r0,r0,r4                        @ contribution from a to sticky bits
    orr r0,r0,r5
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return

@ ======================== ADDITION PATH, RESULT HAS COMPARABLE MAGNITUDE TO a ========================
80:
@ here |mn|<~|a|
@ r0:r1:r2:r3: mn Q124
@ r4:r5 a IEEE packed
@ r7: -(shift down required to align mn with a), guaranteed negative
@ r8: em+en [biased +0x3ff*2]
@ r14: ea [biased +0x3ff]
    tst r3,#0x20000000
    bne 1f                              @ 2≤mn<4?
    adds r2,r2,r2                       @ normalise so mn is 2..4 Q124; note that the contents of r0 and r1 are always destined for the sticky bit in this path
    adcs r3,r3,r3
    subs r7,r7,#1                       @ correction to alignment shift
1:
@ now we construct an IEEE packed value in r2:r3 such that adding it to r4:r5 gives the correct final result
@ observe that the exponent of this constructed value will be at least two less than that of a (by the "|a| is big compared to |mn|" test above)
@ so the alignment shift in the final addition will be by at least two places; thus we can use bit 0 of the constructed
@ value as a sticky bit, and we still have one bit in hand for rounding
    subs r7,r7,#2                       @ now r7 < -2
    orr r0,r0,r2,lsl#23                 @ shift r2:r3 down 9 places, ORing excess into sticky bits
    lsrs r2,r2,#9
    orr r2,r2,r3,lsl#23
    lsrs r3,r3,#9
    orrs r0,r0,r1
    it ne
    orrne r2,r2,#1                      @ sticky bit from bottom 64 bits of mn as shifted
@ r2:r3 mn 2..4 Q51, i.e. 1..2 Q52
@ r2b0 holds sticky bit; note that for alignment with a in r4:r5, r2:r3 will be shifted down at least one place
    lsrs r6,r5,#31                      @ get sign of a (which in this path is the same as the sign of mn, and of the result)
    orr r3,r3,r6,lsl#31                 @ set sign in mn
    adds r14,r7,r14                     @ get exponent for mn relative to a; note this can go negative
    add r3,r3,r14,lsl#20                @ note that "implied" 1 is present in r3, giving an offset of 1 in the exponent
    bmi 1f                              @ negative? then we have just constructed a denormal (or less) and the addition will give an incorrect result
    dcp_dadd_m r0,r1,r2,r3,r4,r5
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
1:
@ compare with similar code in subtraction path: here we cannot underflow
    cmn r7,#64                          @ if the alignment shift for mn is very large then the result is just a
    ble 82f
    add r3,r3,#0x40000000               @ ea cannot be very large (as adding r7 made it negative), so safe to add 1024 to exponents of both a and mn
    add r5,r5,#0x40000000
    dcp_dadd_m r0,r1,r2,r3,r4,r5
    sub r1,r1,#0x40000000               @ fix exponent
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return

90:
@ dcp_dmul_m tail then dadd ("mla path"): two roundings, but correct for zero/Inf/NaN operands
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
    dcp_dadd_m r0,r1,r0,r1,r4,r5
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return

82:
@ |mn| is very small compared to |a|, so result is a
    RDDM r0,r1                          @ clear the engaged flag
    movs r0,r4
    movs r1,r5
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return

@ ======================== SUBTRACTION PATH ========================
50:
    tst r14,#0x70000000                 @ were any of the arguments zero/inf/NaN?
    bne 90b                             @ then use mla path which gives the correct result in all these cases
    ubfx r14,r5,#20,#11                 @ r14=ea
@ now all operands are finite and non-zero
@ r0:r1:r2:r3: full product mn Q124 1≤mn<4
@ r4:r5 a IEEE packed (including sign bit; sign of mn is opposite as we are in the subtraction path)
@ r8: em+en [+0x3ff*2]
@ r14: ea [+0x3ff]
    subw r8,r8,#0x3fc                   @ em+en+3
    subs r7,r8,r14                      @ em+en-ea+3 (debiased)
    blt 80f                             @ branch if |a| is big compared to |mn|, more precisely if ea-(em+en)≥4 so e.g. if ea=0 then em+en≤-4 and mn<4.2^-4=1/4
    beq 94f                             @ branch if ea-(em+en)=3 e.g. if ea=0 then em+en=-3 and 1/8=2^-3≤mn<4.2^-3=1/2
@ in this branch, if e.g. em+en=0 (so 1≤mn<4) then ea≤2 and a<8
    rsbs r7,r7,#75                      @ 75-(em+en-ea+3) = 72-(em+en-ea), shift up 0..74 that will be required for a (Q52) to align with mn (Q124, ending in 20 zeros)
    mvn r14,r5,lsr#31                   @ save complement of sign of a
@ subtract (shifted) a from mn
    and r6,r7,#0x1f                     @ k=shift mod 32
    mov r12,#1
    bfi r5,r12,#20,#12                  @ insert implied 1 in a
    lsl r12,r12,r6                      @ 2^k
    umull r5,r6,r5,r12
    sub r12,#1
    umlal r4,r5,r4,r12                  @ shift a up by shift amount mod 32 (see comment in addition path)
@ r4:r5:r6: a shifted up by k=shift mod 32
    bmi 91f                             @ will a be shifted down?
    cmp r7,#64                          @ shift up by two more words?
    bge 92f
    cmp r7,#32                          @ shift up by one more word?
    bge 93f
    subs r0,r0,r4                       @ no more word shifts; this cannot go negative or have bad cancellation
    sbcs r1,r1,r5
    sbcs r2,r2,r6
    sbcs r3,r3,#0                       @ r0:r1:r2:r3: mn - a (cf dmul macro)
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD                                @ as dmul macro tail: exponent and sign computed in coprocessor is correct
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
94:
@ here if ea-(em+en)=3 e.g. if ea=0 then em+en=-3 and 1/8=2^-3≤mn<4.2^-3=1/2
@ r0:r1:r2:r3: full product mn Q124 1≤mn<4
@ r4:r5 a IEEE packed (including sign bit; sign of mn is opposite as we are in the subtraction path)
    lsls r5,r5,#11                      @ convert a to mantissa Q63 in r4:r5
    orrs r5,r5,r4,lsr#21
    lsls r4,r4,#11
    orrs r5,r5,0x80000000               @ implied 1
    movs r6,#0
    subs r0,r6,r0                       @ compute |a|-|mn|
    sbcs r6,r6,r1
    sbcs r4,r4,r2
    sbcs r5,r5,r3
    WXMS r0,r6                          @ write sticky bits
    WXMO r4,r5                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
    eor r1,r1,0x80000000                @ sign of result is opposite to that of product as yielded by coprocessor
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
93:
    subs r1,r1,r4                       @ shifting a up by one word: this cannot go negative or have bad cancellation
    sbcs r2,r2,r5
    sbcs r3,r3,r6
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
92:
    subs r2,r2,r4                       @ shifting a up by two words: this /can/ go negative or have bad cancellation
    sbcs r3,r3,r5
    cmp r3,#0x01000000                  @ check we have at least 57 bits of product so that dmul tail will round correctly (this test is slightly conservative - 55 needed?)
    blt 1f                              @ also trap case where result is negative
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return

@ heavy cancellation case: normalise and round entirely in software
@ r0:r1:r2:r3: result Q124, signed
@ r8: em+en+3
@ r14b0: save complement of sign of a
1:
    sub r8,r8,#1                        @ em+en+2 (flag-preserving sub: C below is still from the cmp r3,#0x01000000 above)
    RDDM r6,r7                          @ clear engaged flag
    blo 2f                              @ if result is negative...
    movs r6,#0                          @ ... negate it...
    subs r0,r6,r0
    sbcs r1,r6,r1
    sbcs r2,r6,r2
    sbcs r3,r6,r3
    eor r14,r14,#1                      @ ... and flip saved sign
2:
@ now normalise result
    orrs r6,r2,r3                       @ shift up by 64 possible?
    bne 7f
    movs r3,r1                          @ do it
    movs r2,r0
    movs r1,#0
    movs r0,#0
    sub r8,r8,#64                       @ fix exponent
7:
    cmp r3,#0                           @ shift up by 32 possible?
    bne 8f
    movs r3,r2                          @ do it
    movs r2,r1
    movs r1,r0
    movs r0,#0
    sub r8,r8,#32
8:
    cmp r3,#0                           @ is result zero? return it
    beq 9f
    clz r6,r3                           @ k=amount of final shift
    subs r8,r8,r6                       @ final exponent
    movs r7,#1
    lsls r7,r7,r6                       @ r7=2^k
    muls r3,r3,r7
    subs r7,r7,#1                       @ 2^k-1
    umlal r2,r3,r2,r7                   @ multiword shift-up by k via x*2^k = x + x*(2^k-1)
    umlal r1,r2,r1,r7
    umlal r0,r1,r0,r7                   @ r0:r1:r2:r3: normalised result
    orrs r0,r0,r1                       @ any sticky bits below top 64?
    it ne
    orrne r2,r2,#1                      @ or into sticky bit
    lsrs r0,r2,#11                      @ align to mantissa position for IEEE format
    lsrs r1,r3,#11
    orr r0,r0,r3,lsl#21
    lsls r2,r2,#22                      @ rounding bit in C, sticky bit in ~Z
    bcc 10f                             @ no rounding?
    beq 11f                             @ rounding tie?
    adcs r0,r0,#0                       @ round up (C is set)
    adcs r1,r1,#0
    adds r8,r8,r1,lsr#20                @ candidate for exponent field
    ble 12f                             @ underflow? overflow cannot occur here as the result is smaller in magnitude than a
    bfi r1,r8,#20,#11                   @ insert exponent
    orr r1,r1,r14,lsl#31                @ or in sign
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
11:
    adcs r0,r0,#0                       @ round up as above
    adcs r1,r1,#0
    bic r0,r0,#1                        @ to even
    adds r8,r8,r1,lsr#20                @ candidate for exponent field
    ble 12f                             @ underflow?
    bfi r1,r8,#20,#11                   @ insert exponent
    orr r1,r1,r14,lsl#31                @ or in sign
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
10:
    adds r8,r8,r1,lsr#20                @ candidate for exponent field
    ble 12f                             @ underflow?
    bfi r1,r8,#20,#11                   @ insert exponent
    orr r1,r1,r14,lsl#31                @ or in sign
9:
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
12:
    mov r1,r14,lsl#31                   @ underflow: return signed zero
    movs r0,#0
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
91:
@ case where a (Q52) is shifted down relative to mn (Q124); the mod 32 part of the shift of a has already been done
@ r0:r1:r2:r3: mn
@ r4:r5:r6: a
@ r7: alignment shift required (negative)
    cmn r7,#32                          @ shift down one word?
    bge 94f
    cmn r7,#64                          @ shift down two words?
    bge 95f
@ here a is shifted entirely below the bottom of m
    subs r0,r0,#1                       @ subtract an epsilon (a is non-zero)
    sbcs r1,r1,#0
    sbcs r2,r2,#0
    sbcs r3,r3,#0
    orr r0,r0,#1                        @ ensure the sticky bit is set (a is non-zero)
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
94:
    rsbs r4,r4,#0                       @ one word shift down
    sbcs r0,r0,r5
    sbcs r1,r1,r6
    sbcs r2,r2,#0
    sbcs r3,r3,#0
    orr r0,r0,r4                        @ sticky bits
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
95:
    movs r7,#0                          @ two words shift down
    subs r4,r7,r4
    sbcs r5,r7,r5
    sbcs r0,r0,r6
    sbcs r1,r1,r7
    sbcs r2,r2,r7
    sbcs r3,r3,r7
    orrs r0,r0,r4                       @ sticky bits
    orrs r0,r0,r5
    WXMS r0,r1                          @ write sticky bits
    WXMO r2,r3                          @ write sticky+result bits
    NRDD
    RDDM r0,r1
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return

80:
@ here |a| is big compared to |mn|, more precisely ea-(em+en)≥4 so e.g. if ea=0 then em+en≤-4 and mn<4.2^-4=1/4
@ r0:r1:r2:r3: mn Q124
@ r4:r5: a IEEE packed
@ r7<0, em+en-ea+3 (debiased)
@ r14: ea [+0x3ff]
    lsrs r6,r3,#29
    bne 1f                              @ 2≤mn<4?
    adds r2,r2,r2                       @ shift up one place
    adcs r3,r3,r3
    subs r7,r7,#1                       @ fix exponent
1:
@ now r2:r3 is mn Q61, sticky bits in r0:r1
    subs r7,r7,#3                       @ r7=emn-ea <-3
    orr r0,r0,r2,lsl#23                 @ gather sticky bits
    lsrs r2,r2,#9                       @ adjust mn to Q52 ready to create packed IEEE version of mn
    orr r2,r2,r3,lsl#23
    lsrs r3,r3,#9
    orrs r0,r0,r1                       @ or of all sticky bits
    it ne
    orrne r2,r2,#1                      @ sticky bit from bottom 64 bits of mn
    mvn r6,r5,lsr#31                    @ complement of sign of a
    orr r3,r3,r6,lsl#31                 @ fix sign of mn so we do a subtraction
    adds r14,r7,r14                     @ this can go negative; r14 is now at most ea[+0x3ff]-4
    add r3,r3,r14,lsl#20                @ the exponent field in r2:r3 (mn) is now at most ea[+0x3ff]-3
@ that means that in the dadd operation that follows, mn will be shifted down at least three places to align with a,
@ and a post-normalisation shift up of at most one place will be needed
@ therefore in the worst case r2b2 affects b0 of the result; r2b1 affects the rounding of the result; and r2b0 can be used as a sticky bit
    bmi 1f                              @ did exponent go negative?
    dcp_dadd_m r0,r1,r2,r3,r4,r5
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return
1:
    cmn r7,#64                          @ is mn being shifted well below the bottom of a?
    ble 82b                             @ then result is just a
    add r3,r3,#0x40000000               @ otherwise offset exponents by +1024
    add r5,r5,#0x40000000
    dcp_dadd_m r0,r1,r2,r3,r4,r5
    ubfx r2,r1,#20,#11                  @ get exponent
    cmp r2,#0x400                       @ too small?
    itte ls
    andls r1,r1,0x80000000              @ flush to signed zero
    movls r0,#0
    subhi r1,r1,#0x40000000             @ else fix exponent of result
// todo optimize this based on final decision on saving_func_entry
    pop {r4-r8,lr}
    saving_func_return

double_section fma_fast

@ cf saving_func macro: but here we need to record the SP before the state save possibly changes it
1:
    push {lr}                           // 16-bit instruction
    bl __dcp_save_state                 // 32-bit instruction
    b 1f                                // 16-bit instruction

@ non-fused multiply-add: rounds after the multiply and again after the add
@ r0:r1 m
@ r2:r3 n
@ [r13,#0] a
regular_func fma_fast
regular_func mla
    mov r12,sp                          @ save the SP
    PCMP apsr_nzcv                      @ test the engaged flag
    bmi 1b
1:
    push {r4,r5,r14}
    dcp_dmul_m r0,r1,r0,r1,r2,r3,r0,r1,r2,r3,r4,r5,r14
    ldrd r2,r3,[r12,#0]                 @ fetch a using original SP
    dcp_dadd_m r0,r1,r0,r1,r2,r3
// todo optimize this based on final decision on saving_func_entry
    pop {r4,r5,r14}
    saving_func_return

#endif