/*
 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#if !PICO_RP2040
#include "pico/asm_helper.S"

pico_default_asm_setup

.macro double_section name
#if PICO_DOUBLE_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

.macro double_wrapper_section func
double_section WRAPPER_FUNC_NAME(\func)
.endm

.global rtwopi
.global logtab0
.global exptab0
.global exptab1
.global exptab2
.global trigtab

@ load a 32-bit constant n into register rx
.macro movlong rx,n
 movw \rx,#(\n)&0xffff
 movt \rx,#((\n)>>16)&0xffff
.endm

double_section rtwopi

// 1/2π to plenty of accuracy, 256 bits per line
.long 0                      @ this allows values of e down to -32
rtwopi:
.long 0,0
.long 0x28BE60DB, 0x9391054A, 0x7F09D5F4, 0x7D4D3770, 0x36D8A566, 0x4F10E410, 0x7F9458EA, 0xF7AEF158
.long 0x6DC91B8E, 0x909374B8, 0x01924BBA, 0x82746487, 0x3F877AC7, 0x2C4A69CF, 0xBA208D7D, 0x4BAED121
.long 0x3A671C09, 0xAD17DF90, 0x4E64758E, 0x60D4CE7D, 0x272117E2, 0xEF7E4A0E, 0xC7FE25FF, 0xF7816603
.long 0xFBCBC462, 0xD6829B47, 0xDB4D9FB3, 0xC9F2C26D, 0xD3D18FD9, 0xA797FA8B, 0x5D49EEB1, 0xFAF97C5E
.long 0xCF41CE7D, 0xE294A4BA, 0x9AFED7EC, 0x47E35742, 0x1580CC11

double_section drrcore

@ input:
@ r0:r1 mantissa m Q52
@ r2 exponent e>=-32, typically offset by +12
@ r3 pointer to rtwopi table
@ output:
@ r0..r2 preserved
@ r7:r8 range reduced result
@ r3,r4,r5,r6,r9,r10 trashed
.thumb_func
drr_core:
 ldr r3,=rtwopi
 and r5,r2,#31               @ s=e%32
 mov r7,#1
 lsls r7,r7,r5               @ 1<<s
 asrs r8,r2,#5               @ k=e/32, k<=32 for double with e offset <32; with e offsets up to 12+64, k<=34
 umull r4,r5,r7,r0
 movs r6,#0
 umlal r5,r6,r7,r1
@ r2       e
@ r4:r5:r6 u0:u1:u2 = m<<(e%32); u2 is never more than 2<<20
@ r8       e/32
 add r3,r3,r8,lsl#2          @ p

 ldr r9,[r3,#16]             @ a0=p[4]
 umull r10,r7,r9,r6          @ r0=a0*u2 hi, discard lo
@ r7  r0
 ldr r9,[r3,#12]             @ a1=p[3]
 umull r10,r8,r9,r5          @ r1=a1*u1 hi, discard lo
 umaal r7,r8,r9,r6           @ r0:r1=r0+r1+a1*u2
@ r7:r8  r0:r1
 ldr r9,[r3,#8]              @ a2=p[2]
 mla r8,r9,r6,r8             @ r1+=a2*u2
 umlal r7,r8,r9,r5           @ r0:r1+=a2*u1
 umull r10,r6,r9,r4          @ r0x=a2*u0 hi, discard lo; u2 no longer needed
@ r7:r8 r0:r1
@ r6 r0x
 ldr r9,[r3,#4]              @ a3=p[1]
 mla r8,r9,r5,r8             @ r1+=a3*u1
 umlal r7,r8,r9,r4           @ r0:r1+=a3*u0
@ r7:r8 r0:r1
@ r6 r0x
 ldr r9,[r3,#0]              @ a4=p[0]
 mla r8,r9,r4,r8             @ r1+=a4*u0
@ r7:r8 r0:r1
@ r6 r0x
 adds r7,r6
 adc r8, r8, #0
 bx r14

double_wrapper_section tan
wrapper_func tan
 push {r14}
 bl sincos_raw
 bl __aeabi_ddiv
 pop {r15}

double_wrapper_section sin_cos

10:                          @ process Inf/NaN for dsin and dcos
 orrs r2,r0,r1,lsl#12
 it eq
 orreq r1,#0x80000000
 orr r1,#0x00080000
 movs r3,r1                  @ copy results to cosine output
 movs r2,r0
 bx r14

@ case where angle is very small (<2^-32) before reduction
32:
 cbnz r3,33f
 movs r0,#0                  @ flush denormal
 movs r1,#0
33:
 movs r2,#0                  @ return x for sine, 1 for cosine
 movlong r3,0x3ff00000
 tst r12,#2
 itt ne
 movne r0,r2                 @ calculating cosine? move to result registers
 movne r1,r3
 bx r14

@ case where angle is fairly small after reduction
30:
 movs r2,#0
 movs r3,#0
 movs r4,#0
 movs r5,#0x80000000
 b 31f

40:
@ case where range-reduced angle is very small
@ here
@ r0:r1 mantissa
@ r2 exponent+12
@ r7:r8 abs range-reduced angle < 2^-10 Q64
@ r12b31: dsincos flag
@ r12b30: original argument sign
@ r12b2..1: quadrant count
@ r12b0: sign of reduced angle
 push {r12}
 movs r12,#0
2:
 add r12,#4
 add r2,#4
 bl drr_core                 @ repeat range reduction with extra factor of 2^4 (, 2^8, 2^12, 2^16,...)
 eors r9,r8,r8,asr#31        @ we want to keep the sign in r8b31 for later
 cmp r9,#1<<24               @ ≥ 2^-8?
 bhs 1f                      @ loop until the result is big enough
 cmp r12,#64                 @ safety net
 bne 2b
1:
 eors r7,r7,r8,asr#31
@ here r7:r9 is the abs range-reduced angle Q64+r12, 2^-8..2^-4 in Q64 terms

@ 2π=6.487ED511 0B4611A6 (2633...)
 movlong r6,0x0B4611A6       @ 2π Q64 low fractional word
 umull r10,r0,r9,r6
 movlong r6,0x487ED511       @ 2π Q64 high fractional word
 umull r10,r1,r7,r6
 umaal r0,r1,r9,r6
 movs r6,#6                  @ 2π integer part
 umlal r0,r1,r7,r6
 mla r1,r9,r6,r1

@ here
@ r0:r1 θ, abs range reduced angle θ 0..π/4 Q64+r12
@ r8b31: sign of reduced angle
@ r12: excess exponent ≥4, multiple of 4
@ r0:r1 / 2^r12 < ~ 2π * 2^-10 so for sin we need to go to term in x^5
 rsbs r10,r12,#32
 bmi 1f                      @ very small result?

 lsr r3,r1,r12
 lsr r2,r0,r12
 lsl r9,r1,r10
 orr r2,r9
@ r2:r3 θ Q64 (with r12 bits of loss of precision)
 umull r9,r4,r2,r3
 umull r9,r5,r2,r3
 umaal r4,r5,r3,r3
@ r4:r5 θ² Q64 = θ²/2 Q65 < ~ 4π² * 2^-20
 umull r9,r6,r4,r5
 umull r9,r7,r4,r5
 umaal r6,r7,r5,r5
@ r6:r7 θ⁴ Q64 < ~ 16π⁴ * 2^-40 < 2^-29
 lsrs r6,#3
 orrs r6,r6,r7,lsl#29
@ r6 θ⁴ Q61
 mov r9,#0xaaaaaaaa          @ 2/3 Q32
 umull r6,r7,r6,r9
@ r7 θ⁴ * 2/3 Q61 = θ⁴/24 Q65
 rsbs r6,r4,r7
 sbcs r7,r5,r5,lsl#1
@ r6:r7  - θ²/2 + θ⁴/24 Q65, <=0
 asrs r9,r7,#12
 lsrs r6,#12
 adcs r6,r6,r7,lsl#20
 adcs r7,r9,#0x0ff00000
 adds r7,#0x30000000
 push {r6,r7}                @ packed cos θ

@ here
@ r0:r1 θ, abs range reduced angle θ 0..π/4 Q64+r12
@ r4:r5 θ² Q64 = θ²/2 Q65
 umull r9,r2,r0,r5
 umull r9,r3,r1,r4
 umaal r2,r3,r1,r5
@ r2:r3 θ³ Q64+r12
 umull r9,r6,r2,r5
 umull r9,r7,r3,r4
 umaal r6,r7,r3,r5
@ r6:r7 θ⁵ Q64+r12; in fact r7 is always 0
 mov r9,#0xaaaaaaaa          @ 2/3 Q32
 umull r10,r4,r2,r9
 movs r5,#0
 umlal r4,r5,r3,r9
 adds r4,r4,r5
 adcs r5,r5,#0
@ r4:r5 θ³*2/3 Q64+r12 = θ³/6 Q66+r12
 mov r9,#0x88888888          @ 8/15 Q32
 umull r2,r3,r6,r9
@ r3 θ⁵*8/15 Q64+r12 = θ⁵/120 Q70+r12
 subs r4,r4,r3,lsr#4
 sbc r5,r5,#0
@ r4:r5 θ³/6-θ⁵/120 Q66+r12
 subs r0,r0,r4,lsr#2
 sbc r1,r1,r5,asr#2
 subs r0,r0,r5,lsl#30
 sbc r1,r1,#0
@ r0:r1 θ-θ³/6+θ⁵/120 Q64+r12
 rsb r4,r12,#0x400
 sub r4,#14
 bl dpack_q
 pop {r6,r7,r12}             @ get cosine, flags
 b 2f

1:
@ case where range reduction result has excess exponent ≥ 32
@ so sin x=x, cos x=1
 rsb r4,r12,#0x400
 sub r4,#14
 bl dpack_q
 pop {r12}                   @ get flags
 movs r6,#0
 movlong r7,0x3ff00000
2:
@ here r0:r1 is packed sine, r6:r7 is packed cosine
 lsrs r8,#31
 bfi r12,r8,#0,#1

 asrs r9,r12,#1
 bmi 23f                     @ doing dsincos?
 asrs r9,#1
 bcc 21f                     @ need sine?
@ need cosine:
 ands r12,#4
 orrs r1,r7,r12,lsl#29       @ insert sign
 movs r0,r6
 pop {r4-r10,r15}

21:
 eors r12,r12,r12,lsr#2
 orrs r1,r1,r12,lsl#31       @ insert sign
 pop {r4-r10,r15}

23:
 ands r4,r12,#4
 eors r7,r7,r4,lsl#29        @ insert sign
 push {r6,r7}
 b 20f

@ sincos à la GNU with pointers to where to put results
wrapper_func sincos
 push {r2-r5, lr}
 bl sincos_raw
 pop {r4-r5}
 stmia r4!, {r0, r1}
 stmia r5!, {r2, r3}
 pop {r4, r5, pc}

@ sincos with results in r0:r1 and r2:r3
.thumb_func
sincos_raw:
 ands r12,r1,#1<<31
 lsrs r12,#1                 @ save argument sign in r12b30
 orrs r12,r12,#1<<31         @ flag we want both results in r12b31
 b 1f

wrapper_func sin
 lsrs r12,r1,#29             @ negative argument -> +2 quadrants
 ands r12,#4
 b 1f

wrapper_func cos
 movs r12,#2                 @ cos -> +1 quadrant
1:
 ubfx r3,r1,#20,#11          @ get exponent
 sub r2,r3,#0x3ff
 cmp r2,#0x400
 beq 10b                     @ Inf or NaN?
 cmn r2,#32
 blt 32b                     @ very small argument?
 movs r3,#1
 bfi r1,r3,#20,#12           @ fix implied 1 in mantissa
 push {r4-r10,r14}
 add r2,#12                  @ e+12
 bl drr_core
@ r7:r8 θ/2π 0..1 Q64
 lsrs r4,r8,#30              @ quadrant count
 adc r4,r4,#0                @ rounded
 sub r8,r8,r4,lsl#30         @ now -0.125≤r7:r8<+0.125 Q64
 add r12,r12,r4,lsl#1
 orr r12,r12,r8,lsr#31       @ sign into r12b0
@ r12b2..1: quadrant count
@ r12b0: sign of reduced angle
 eors r7,r7,r8,asr#31
 eors r8,r8,r8,asr#31        @ absolute value of reduced angle 0≤r7:r8<0.125 Q64
 cmp r8,#1<<22               @ < 2^-10?
 blo 40b

@ 2π=6.487ED511 0B4611A6 (2633...)
 movlong r9,0x0B4611A6       @ 2π Q64 low fractional word
 umull r10,r0,r8,r9
 movlong r9,0x487ED511       @ 2π Q64 high fractional word
 umull r10,r1,r7,r9
 umaal r0,r1,r8,r9
 movs r9,#6                  @ 2π integer part
 umlal r0,r1,r7,r9
 mla r1,r8,r9,r1
@ r0:r1 range reduced angle θ 0..π/4 Q64

 cmp r1,#1<<25               @ θ < 2^-7?
 blo 30b
 lsrs r2,r1,#27
 ldr r3,=trigtab
 add r3,r3,r2,lsl#4
 ldmia r3,{r2-r5}
31:
 subs r0,r0,r2
 sbcs r1,r1,r3               @ ε=Θ-φ Q64
 bmi 2f                      @ ε negative?

 asrs r6,r12,#1
 bmi 5f                      @ doing dsincos?
 asrs r6,#1
 bcs 3f                      @ need cosine?

@ here ε is positive, we need the sin of θ and the sign of the result is r12b0^r12b2
 bl dsc_h0
 bl dsc_h0s
 bl dpack_q63
21:
 eors r12,r12,r12,lsr#2
 orrs r1,r1,r12,lsl#31       @ insert sign
 pop {r4-r10,r15}

2:
 asrs r6,r12,#1
 bmi 6f                      @ doing dsincos?
 asrs r6,#1
 bcs 4f                      @ need cosine?

@ here ε is negative, we need the sin of θ and the sign of the result is r12b0^r12b2
 bl dsc_h1
 bl dsc_h1s
 bl dnegpack_q63
 eors r12,r12,r12,lsr#2
 orrs r1,r1,r12,lsl#31       @ insert sign
 pop {r4-r10,r15}

@ here ε is positive, we need the cos of θ and the sign of the result is r12b2
3:
 bl dsc_h0
 bl dsc_h0c
 bl dnegpack_q63
22:
 ands r12,#4
 orrs r1,r1,r12,lsl#29       @ insert sign
 pop {r4-r10,r15}

@ here ε is negative, we need the cos of θ and the sign of the result is r12b2
4:
 bl dsc_h1
 bl dsc_h1c
15:
 bl dpack_q63
 ands r12,#4
 orrs r1,r1,r12,lsl#29       @ insert sign
 pop {r4-r10,r15}

5:
@ dsincos, ε positive
 bl dsc_h0
 push {r2-r7}
 bl dsc_h0c
 bl dnegpack_q63
 ands r4,r12,#4
 eors r1,r1,r4,lsl#29        @ negate cosine in quadrants 2 and 3
 pop {r2-r7}
 push {r0,r1}
 bl dsc_h0s
 bl dpack_q63
20:
 eors r4,r12,r12,lsr#1
 eors r4,r4,r12,lsr#2
 eors r1,r1,r4,lsl#31        @ negate sine on b0^b1^b2
 tst r12,#2                  @ exchange sine and cosine in odd quadrants
 ittte ne
 movne r2,r0
 movne r3,r1
 popne {r0,r1}
 popeq {r2,r3}
 ands r4,r12,#1<<30
 eors r1,r1,r4,lsl#1         @ negate sine result if argument was negative
 pop {r4-r10,r15}

6:
@ dsincos, ε negative
 bl dsc_h1
 push {r2-r7}
 bl dsc_h1c
 bl dpack_q63
 ands r4,r12,#4
 eors r1,r1,r4,lsl#29        @ negate cosine in quadrants 2 and 3
 pop {r2-r7}
 push {r0,r1}
 bl dsc_h1s
 bl dnegpack_q63
 b 20b

@ sin/cos power series for negative ε
dsc_h1:
 rsbs r0,r0,#0
 sbc r1,r1,r1,lsl#1
@ drop into positive ε code

@ sin/cos power series for positive ε
dsc_h0:
@ r0:r1 ε Q64
@ r4:   sin φ Q32
@ r5:   cos φ Q32
 umull r6,r7,r1,r1
 umull r2,r3,r0,r1
 lsrs r7,#1
 rrx r6,r6
 adds r2,r6,r3
 adc r3,r7,#0
@ r2:r3 ε²/2 Q64
 umull r7,r6,r3,r0
 umlal r7,r6,r2,r1
 movs r7,#0
 umlal r6,r7,r3,r1
@ r6:r7 ε³/2 Q64
 mov r8,#0x55555555
 umull r9,r6,r6,r8
 mov r9,#0
 umlal r6,r9,r7,r8
 adds r6,r6,r9
 adcs r7,r9,#0
@ r6:r7 ε³/6 Q64
 subs r6,r0,r6
 sbcs r7,r1,r7
@ r6:r7 ε-ε³/6 Q64
 mov r0,r3,lsl#12            @ ε²/2 Q44
 orr r0,r0,r2,lsr#20
 umull r0,r9,r0,r0           @ r9: ε⁴/4 Q88-32=Q56 = ε⁴/32 Q59
 umlal r0,r9,r9,r8           @ r9: ε⁴/24 Q59

 umull r0,r8,r9,r1           @ ε⁵/24 Q59
 mov r0,#0x33333333
 umull r0,r8,r8,r0           @ r8: ε⁵/120 Q59
@ r6:r7 ε-ε³/6+ε⁵/120 Q64
 smmulr r10,r8,r1            @ r10: ε⁶/120 Q59
 movlong r0,0x2aaaaaaa
 smmlsr r9,r0,r10,r9         @ ε⁴/24-ε⁶/720
 subs r2,r2,r9,lsl#5
 sbcs r3,r3,r9,lsr#27
@ r2:r3 ε²/2-ε⁴/24+ε⁶/720 Q64
 smmulr r10,r10,r1           @ r10: ε⁷/120 Q59
 mov r0,#0x6180000           @ 1/42 Q64
 smmlsr r8,r10,r0,r8         @ r8: ε⁵/120-ε⁷/5040 Q59
 adds r6,r6,r8,lsl#5
 adcs r7,r7,r8,lsr#27
 bx r14

dsc_h0s:
@ postprocess for sine, positive ε
@ r2:r3 1-cos |ε| Q64
@ r4:   sin φ Q31
@ r5:   cos φ Q31
@ r6:r7 sin |ε| Q64
 umull r8,r0,r2,r4
 mov r1,#0
 umlal r0,r1,r3,r4
@ r0:r1 sin φ(1-cos ε)
 umull r8,r3,r6,r5
 umlal r3,r4,r7,r5
@ r3:r4 sin φ+cos φ.sin ε
 subs r0,r3,r0
 sbc r1,r4,r1
@ r0:r1 sin φ+cos φ.sin ε - sin φ(1-cos ε) = sin φ.cos ε + cos φ.sin ε = sin(φ+ε)
 bx r14

dsc_h1s:
@ postprocess for sine, negative ε
@ r2:r3 1-cos |ε| Q64
@ r4:   sin φ Q31
@ r5:   cos φ Q31
@ r6:r7 sin |ε| Q64
 umull r8,r0,r2,r4
 mov r1,#0
 umlal r0,r1,r3,r4
@ r0:r1 sin φ(1-cos ε)
 umull r8,r3,r6,r5
 rsbs r4,r4,#0
 umlal r3,r4,r7,r5
@ r3:r4 -sin φ-cos φ.sin ε
 adds r0,r3,r0
 adc r1,r4,r1
@ r0:r1 -sin φ-cos φ.sin ε + sin φ(1-cos ε) = -sin φ.cos ε - cos φ.sin ε = -sin(φ+ε)
 bx r14

dsc_h0c:
@ postprocess for cosine, positive ε
@ r2:r3 1-cos |ε| Q64
@ r4:   sin φ Q32
@ r5:   cos φ Q32
@ r6:r7 sin |ε| Q64
 umull r8,r0,r2,r5
 mov r1,#0
 umlal r0,r1,r3,r5
@ r0:r1 cos φ(1-cos ε)
 umull r8,r3,r6,r4
 rsbs r5,#0
 umlal r3,r5,r7,r4
@ r3:r5 -cos φ+sin φ.sin ε
 adds r0,r3,r0
 adc r1,r5,r1
@ r0:r1 -cos φ+sin φ.sin ε + cos φ(1-cos ε) = - cos φ.cos ε + sin φ.sin ε = -cos(φ+ε)
 bx r14

dsc_h1c:
@ postprocess for cosine, negative ε
@ r2:r3 1-cos |ε| Q64
@ r4:   sin φ Q32
@ r5:   cos φ Q32
@ r6:r7 sin |ε| Q64
 umull r8,r0,r2,r5
 mov r1,#0
 umlal r0,r1,r3,r5
@ r0:r1 cos φ(1-cos ε)
 umull r8,r3,r6,r4
 umlal r3,r5,r7,r4
@ r3:r5 cos φ-sin φ.sin ε
 subs r0,r3,r0
 sbc r1,r5,r1
@ r0:r1 cos φ-sin φ.sin ε - cos φ(1-cos ε) = cos φ.cos ε - sin φ.sin ε = cos(φ+ε)
 bx r14

double_section dpack

@ dnegpack: negate and pack
@ dpack_q63:
@ input
@ r0:r1 Q63 result, must not be zero
@ r4    exponent offset [dpack_q only]
@ output
@ r0:r1 IEEE double
@ trashes r2,r3,r4
.thumb_func
dnegpack_q63:
 rsbs r0,r0,#0
 sbc r1,r1,r1,lsl#1
.thumb_func
dpack_q63:
 mov r4,#0x3ff-12            @ exponent
.thumb_func
dpack_q:
 clz r2,r1
 cmp r2,#12
 bhs 1f
 adds r2,#21
 lsls r3,r1,r2
 rsb r2,#32
 lsrs r0,r0,r2               @ save rounding bit in carry
 lsr r1,r1,r2
 add r2,r2,r4
 adcs r0,r0,r3               @ with rounding
 adc r1,r1,r2,lsl#20         @ insert exponent
 bx r14

1:
 cbz r1,2f
 rsb r2,#43
 lsrs r3,r0,r2
 rsb r2,#32
 lsls r1,r1,r2
 lsls r0,r0,r2
 orrs r1,r3
 sub r2,r4,r2
 add r1,r1,r2,lsl#20
 bx r14

2:
 movs r1,r0
 mov r0,#0
 sub r4,#32
 bne dpack_q
 bx r14


double_section dreduce

@ input:
@ r0:r1 x, double (only mantissa bits used)
@ r2    quotient offset
@ r3    exponent e of x with 0x3ff bias removed, -32≤e<12 so 2^-32≤x<2^12
@ r4:r5 r Q64, 0.5≤r<1
@ r6    1/r underestimate Q31
@ output:
@ r0:r1 x mod r Q64 [possibly slightly > r?]
@ r2    quotient+offset
@ r4:r5 r preserved
@ trashes r7,r8
@ increases r2 by up to 2^13
@ this version only used by dexp
.thumb_func
dreduce:
 movs r7,#1
 bfi r1,r7,#20,#12           @ insert implied 1, clear exponent and sign
 lsls r8,r7,r3
 beq 1f                      @ e<0, x<1
 umull r0,r7,r0,r8
 mla r1,r1,r8,r7
@ r0:r1 x Q52
 umull r7,r8,r1,r6           @ Q83-32=Q51
 lsrs r6,r8,#19              @ q Q0
 adds r2,r2,r6
 umull r7,r8,r6,r4
 mla r8,r6,r5,r8             @ r7:r8 q*r Q64
 lsls r1,#12
 orrs r1,r1,r0,lsr#20
 rsbs r0,r7,r0,lsl#12
 sbc r1,r1,r8                @ x-qr Q64
@ check we never return slightly more than r
 cmp r1,r5                   @ quick comparison
 it lo
 bxlo r14
 b 2f

1:
 adds r3,#12
 movs r7,#1
 lsls r8,r7,r3
 beq 1f                      @ e<-12
 umull r0,r7,r0,r8
 mla r1,r1,r8,r7             @ x Q64
 cmp r1,r5                   @ quick comparison
 it lo
 bxlo r14
2:
 it eq
 cmpeq r0,r4
 it lo
 bxlo r14
 subs r0,r0,r4               @ subtract one r
 sbc r1,r1,r5
 adds r2,#1
 bx r14

1:
@ here e<-12, have to shift r0:r1 down by -r3 places
 add r3,#32
 lsls r6,r1,r3
 rsbs r3,#32
 lsrs r0,r0,r3
 lsrs r1,r1,r3
 orrs r0,r0,r6
 bx r14

double_section exptab

.align 2
exptab0:
.quad 0x0000000000000000,0x0f85186008b15304,0x1e27076e2af2e5c8,0x2f57120421b2120d
.quad 0x3f7230dabc7c5512,0x4e993155a517a717,0x5fabe0ee0abf0d9d,0x6fad36769c6defe3
.quad 0x7ebd623de3cc7b69,0x8f42faf3820681f0,0x9ec813538ab7d537,0xaf70154920b3ab7f
exptab1:
.quad 0x0000000000000000,0x00ff805515885e02,0x01fe02a6b1067890,0x02fb88ebf0214edc
.quad 0x03f815161f807c7a,0x04f3a910d1a95d3c,0x05ee46c1f56c46aa,0x06e7f009ebe465ff
.quad 0x07e0a6c39e0cc013,0x08d86cc491ecbfe1,0x09cf43dcff5eafd5,0x0ac52dd7e4726a46
.quad 0x0bba2c7b196e7e23,0x0cae41876471f5bf,0x0da16eb88cb8df61,0x0e93b5c56d85a909
.quad 0x0f85186008b15331,0x1075983598e47130
exptab2:
.quad 0x000fff8005551559,0x002ffb808febc309,0x004ff3829a0e91b1,0x006fe78722fde71f
.quad 0x008fd78f299aa0c3,0x00afc39bac66434f,0x00cfabada9832a41,0x00ef8fc61eb4b74f
.quad 0x010f6fe6095f81b6,0x012f4c0e66898567,0x014f244032da521a,0x016ef87c6a9b3a48
.quad 0x018ec8c409b781ff,0x01ae95180bbc8d9c,0x01ce5d796bda1070,0x01ee21e924e23b3a
logtab0:
.quad 0xa0ec7f4233957338,0x918986bdf5fa1431,0x8391f2e0e6fa026b,0x7751a813071282e6
.quad 0x6a73b26a68212621,0x5fabe0ee0abf0d9d,0x546ab61cb7e0b419,0x48a507ef3de59695
.quad 0x3c4e0edc55e5cbd1,0x32a4b539e8ad68ce,0x289a56d996fa3ccb,0x21aefcf9a11cb2c9
.quad 0x16f0d28ae56b4b86,0x0f85186008b15304,0x07e0a6c39e0cc002,0x0000000000000000

.align 2
exprrdata:
.quad 0xB17217F7D1CF79AC     @ ln2 Q64
.long 0xB8AA3B29             @ 1/ln2 Q31, rounded down

double_wrapper_section exp

2:
@ could use dadd macro to calculate x+1 here
 lsl r0,r1,#11
 orr r0,#0x80000000
 lsls r1,#1
 adc r3,r3,#32
 movlong r1,0x3ff00000
 rsb r3,#11
 lsr r0,r3
 it cc
 bxcc r14
 rsbs r0,#0
 sbc r1,r1,#0
 bx r14

wrapper_func exp
 movs r12,r1,lsr#31          @ save sign
 ubfx r3,r1,#20,#11          @ get exponent
 sub r3,r3,#0x3ff
 cmp r3,#12
 bge 20f                     @ overflow, Inf or NaN?
 cmn r3,#0x20
 ble 2b                      @ <2^-32? return x+1
 push {r4-r8,r14}
 ldr r4,=exprrdata
 ldmia r4,{r4-r6}
 mov r2,#0
 bl dreduce
 tst r12,#1
 beq 1f
 mvn r2,r2                   @ quotient is now signed
 subs r0,r4,r0
 sbc r1,r5,r1
1:
 add r12,r2,#0x3fe           @ exponent offset
 mov r3,#0x7fe
 cmp r12,r3
 bhs 1f                      @ under/overflow
 lsrs r2,r1,#28
 ldr r3,=exptab0
 add r3,r3,r2,lsl#3
 ldmia r3,{r2-r3}
 and r5,r2,#63
 orr r5,#64                  @ y=(t&0x3f)+0x40; Q6
 subs r0,r2
 sbcs r1,r3
 lsrs r2,r1,#24
 ldr r3,=exptab1
 add r3,r3,r2,lsl#3
 ldmia r3,{r3-r4}
 add r2,#256
 muls r5,r5,r2               @ y Q14
 subs r0,r3
 sbcs r1,r4
 lsrs r2,r1,#21
 ldr r3,=exptab2
 add r3,r3,r2,lsl#3
 ldmia r3,{r3-r4}
 add r2,r2,r2
 add r2,#4096
 mla r5,r5,r2,r5             @ y Q26
 subs r0,r3
 sbcs r1,r4

 movs r2,r1,lsl#10
 orrs r2,r2,r0,lsr#22
 adc r2,r2,#0                @ ε Q42, rounded
 smull r3,r4,r2,r2           @ ε² Q84-32=Q52
 lsrs r3,#21
 orrs r3,r3,r4,lsl#11
 adds r0,r0,r3
 adc r1,r1,r4,lsr#21         @ ε+ε²/2 Q64
 smull r3,r4,r4,r2           @ Q52*Q42=Q94; Q94-32=Q62
 mov r3,#0x55555555          @ 1/6 Q33
 smull r3,r4,r3,r4           @ ε³/6 Q63
 smmulr r3,r4,r1             @ ε⁴/6+ε⁵/12 Q63+Q32-32=Q63
 add r4,r4,r3,lsr#2
 adds r2,r0,r4,lsl#1
 adc r3,r1,r4,asr#31
 lsls r1,r5,#3               @ y Q29
 umull r4,r0,r1,r2           @ εlo * y Q61+32
 smlal r0,r1,r1,r3           @ εhi * y + y Q61
@ assert result is in range 1..2
 lsrs r0,#9
 adcs r0,r0,r1,lsl#23
 lsr r1,#9
 adcs r1,r1,r12,lsl#20
 pop {r4-r8,r15}

20:
@ process Inf/NaN for dexp
 cmp r3,#0x400
 bne 22f
 orrs r2,r0,r1,lsl#12
 ite eq
 biceq r1,r1,r1,asr#31       @ +Inf -> + Inf; -Inf -> +0
 orrne r1,r1,#0x00080000
 bx r14

22:
 movs r0,#0
 movs r1,#0
 tst r12,#1
 it eq
 movteq r1,0x7ff0
 bx r14

1:                           @ under/overflow
 mov r0,#0
 mov r1,#0
 it ge
 movtge r1,#0x7ff0
 pop {r4-r8,r15}


double_wrapper_section log

1:
 movlong r1,0xfff00000       @ return -Inf
 movs r0,#0
 bx r14

4:
 orrs r2,r0,r1,lsl#12
 it ne
 orrne r1,#0x00080000
 bx r14

10:
 mvns r5,r6,asr#22           @ very small argument?
 bne 10f
 mov r4,#4096
 b 11f

@ check for argument near 1: here
@ r1 : mantissa
@ r12: exponent, -1 or 0
12:
 eor r3,r12,r1,lsr#12
 lsls r3,r3,#24              @ check 8 bits of mantissa
 bne 12f                     @ not very close to 1
 cmp r12,#0
 bne 13f
@ argument is 1+ε, result will be positive
 lsls r1,#19
 orrs r1,r1,r0,lsr#13
 lsls r0,#19
@ r0:r1 ε Q71 0≤ε<2^-8
 clz r4,r1                   @ r4≥1
 cmp r4,#32
 bhs 14f
 movs r5,#1
 lsls r5,r4
 umull r2,r3,r0,r5
 mla r3,r1,r5,r3             @ r2:r3 ε Q71+r4
 umull r12,r5,r0,r3
 umull r12,r6,r1,r2
 umaal r5,r6,r1,r3           @ r5:r6 ε² Q142+r4-64 = Q78+r4

 subs r2,r2,r5,lsr#8
 sbc r3,r3,#0
 subs r2,r2,r6,lsl#24
 sbcs r3,r3,r6,lsr#8

 umull r12,r7,r0,r6
 umull r12,r8,r1,r5
 umaal r7,r8,r1,r6           @ r7:r8 ε³ Q149+r4-64 = Q85+r4: when ε is nearly 2^-8, r4=1 and Q86, so r8<0x40000000
 mov r5,#0x55555555          @ ~1/3 Q32

 umull r12,r6,r7,r5
 movs r12,#0
 umlal r6,r12,r8,r5
 adds r6,r6,r12
 adc r12,r12,#0              @ multiply by 0x5555555555555555

 adds r2,r2,r6,lsr#14
 adc r3,r3,#0
 adds r2,r2,r12,lsl#18
 adc r3,r3,r12,lsr#14

 smmulr r5,r8,r1             @ ε⁴ Q53+r4+Q71-Q64=Q60+r4 ~ 2^-32
 movs r7,#0x33333333         @ 1/5 Q32
 smmulr r6,r5,r1             @ ε⁵ Q60+r4+q71-Q64=Q67+r4 ~ 2^-40
 smmulr r8,r6,r7             @ ε⁵/5 Q67+r4 ~ 2^-42
 sub r7,r5,r8,lsr#5
 smmulr r5,r6,r1             @ ε⁶ Q67+r4+q71-Q64=Q74+r4 ~ 2^-48
 movt r6,#0x2a80             @ 1/6 Q32 fiddled slightly (PMC)
 smmulr r5,r6,r5             @ ε⁶/6 Q75+r4 ~ 2^-50
 add r7,r7,r5,lsr#12

 subs r0,r2,r7,lsl#9
 sbc r1,r3,r7,lsr#23

 rsb r4,#0x400
 sub r4,#0x15
 bl dpack_q
 pop {r4-r8,r15}

@ here we have positive ε sufficiently small we need (at most) a quadratic term
14:
@ here r0=ε Q71, 0≤ε<2^-40
 clz r4,r0
 lsls r1,r0,r4               @ ε Q71+r4
 umull r2,r3,r0,r1           @ ε² Q142+r4
 mov r0,#0
 subs r0,r0,r3,lsr#8
 sbc r1,r1,#0
 rsb r4,#0x400
 sub r4,#0x35
 bl dpack_q
 pop {r4-r8,r15}


13:                          @ argument is 1-ε, result will be negative
 movs r1,r1,lsl#18
 orrs r1,r1,r0,lsr#14
 movs r0,r0,lsl#18
 rsbs r0,#0
 sbc r1,r1,r1,lsl#1
@ r0:r1 -ε Q71 -2^-9≤ε<0
 clz r4,r1
 cmp r4,#32
 bhs 15f
 subs r4,#1                  @ 0≤r4<31
 movs r5,#1
 lsls r5,r4
 umull r2,r3,r0,r5
 mla r3,r1,r5,r3             @ r2:r3 ε Q71+r4
 umull r12,r5,r0,r3
 umull r12,r6,r1,r2
 umaal r5,r6,r1,r3           @ r5:r6 ε² Q142+r4-64 = Q78+r4

 adds r2,r2,r5,lsr#8
 adc r3,r3,#0
 adds r2,r2,r6,lsl#24
 adcs r3,r3,r6,lsr#8

 umull r12,r7,r0,r6
 umull r12,r8,r1,r5
 umaal r7,r8,r1,r6           @ r7:r8 ε³ Q149+r4-64 = Q85+r4: when ε is nearly 2^-8, r4=0 and Q85, so r8<0x20000000
 mov r5,#0x55555555          @ ~1/3 Q32

 umull r12,r6,r7,r5
 movs r12,#0
 umlal r6,r12,r8,r5
 adds r6,r6,r12
 adc r12,r12,#0              @ multiply by 0x5555555555555555

 adds r2,r2,r6,lsr#14
 adc r3,r3,#0
 adds r2,r2,r12,lsl#18
 adc r3,r3,r12,lsr#14

 smmulr r5,r8,r1             @ ε⁴ Q53+r4+Q71-Q64=Q60+r4 ~ 2^-32
 movs r7,#0x33333333         @ 1/5 Q32
 smmulr r6,r5,r1             @ ε⁵ Q60+r4+q71-Q64=Q67+r4 ~ 2^-40
 smmulr r8,r6,r7             @ ε⁵/5 Q67+r4 ~ 2^-42
 add r7,r5,r8,lsr#5
 smmulr r5,r6,r1             @ ε⁶ Q67+r4+q71-Q64=Q74+r4 ~ 2^-48
 movt r6,#0x2a80             @ 1/6 Q32 fiddled slightly (PMC)
 smmulr r5,r6,r5             @ ε⁶/6 Q75+r4 ~ 2^-50
 add r7,r7,r5,lsr#12

 adds r0,r2,r7,lsl#9
 adc r1,r3,r7,lsr#23

 rsb r4,#0x400
 sub r4,#0x15
 bl dpack_q
 orr r1,r1,#1<<31
 pop {r4-r8,r15}

@ here we have negative ε sufficiently small we need (at most) a quadratic term
@ here r0=ε Q71, |ε|<2^-41
15:
 clz r4,r0
 lsls r1,r0,r4               @ ε Q71+r4
 umull r2,r3,r0,r1           @ ε² Q142+r4
 mov r0,r3,lsr#8
 rsb r4,#0x400
 sub r4,#0x35
 bl dpack_q
 eors r1,r1,#0x80000000
 pop {r4-r8,r15}

wrapper_func log
 lsls r12,r1,#1
 bcs 1b                      @ x<0?
 lsrs r12,#21
 beq 1b                      @ x==0/denormal?
 sub r12,#0x3ff
 cmp r12,#0x400              @ +Inf/NaN?
 beq 4b
 movs r2,#1
 bfi r1,r2,#20,#12           @ set implied 1, clear exponent Q52
 push {r4-r8,r14}
 cmp r12,r12,asr#31          @ exponent = -1 or 0?
 beq 12b
12:
 lsrs r4,r1,#16
 ldr r5,=logtab0-16*8
 add r5,r5,r4,lsl#3
 ldmia r5,{r2-r3}
 and r5,r2,#63
 add r5,#64
 umull r0,r6,r5,r0
 mla r1,r5,r1,r6             @ Q59

 mvn r4,r1,asr#19
 and r4,#31
 ldr r5,=exptab1
 add r5,r5,r4,lsl#3
 ldmia r5,{r5-r6}
 adds r2,r2,r5
 adc r3,r3,r6
 add r4,#256
 umull r0,r6,r4,r0
 mla r6,r4,r1,r6             @ r0:r6 Q67

 mvns r4,r6,asr#24
 beq 10b                     @ small argument at this stage?
10:
 ldr r5,=exptab2
 add r5,r5,r4,lsl#3
 ldmia r5,{r1,r5}
 adds r2,r2,r1
 adc r3,r3,r5
 mov r5,#4097
 add r4,r5,r4,lsl#1          @ 4097+2k
11:
 lsls r4,#17
 umull r5,r0,r4,r0
 rsb r1,r4,r4,lsl#3
 umlal r0,r1,r4,r6           @ Q96=Q64

@ r0:r1 ε Q64
@ r2:r3 y Q64

 eor r4,r0,r1,asr#31
 eor r5,r1,r1,asr#31         @ r4:r5 |ε| Q64
 umull r6,r7,r5,r5
 umull r4,r8,r4,r5
 lsrs r7,#1
 rrx r6,r6
 adds r6,r6,r8
 adc r7,r7,#0                @ r6:r7 ε²/2 Q64

 movs r4,r1,lsl#10
 orrs r4,r4,r0,lsr#22
 adc r4,r4,#0                @ ε Q42, rounded

 subs r0,r0,r6
 sbc r1,r1,r7                @ r0:r1 ε-ε²/2 Q64

 smmulr r5,r4,r4             @ ε² Q42+42-32=Q52
 smmulr r6,r5,r4             @ ε³ Q52+42-32=Q62
 smmulr r7,r5,r5             @ ε⁴ Q52+52-32=Q72 ε⁴/4 Q74
 mov r4,#0x55555555          @ 1/3 Q32
 smmulr r6,r6,r4             @ ε³/3 Q62
 subs r6,r6,r7,lsr#12        @ ε³/3-ε⁴/4 Q62

 adds r0,r0,r6,lsl#2         @ Q64
 adc r1,r1,r6,asr#30

 subs r0,r2,r0
 sbc r1,r3,r1
 ldr r2,=exprrdata
 ldmia r2,{r2,r5}
 adds r12,#1
 ble 1f
@ positive result
 umull r2,r3,r2,r12
 movs r4,#0
 umlal r3,r4,r5,r12
 subs r2,r2,r0
 sbcs r3,r3,r1
 sbc r4,r4,#0
 movs r1,#0x40000000
 b 2f                        @ to pack result

@ negative result
1:
 rsbs r12,#0
 umull r2,r3,r2,r12
 movs r4,#0
 umlal r3,r4,r5,r12
 adds r2,r0,r2
 adcs r3,r1,r3
 adc r4,r4,#0
 movs r1,#0xc0000000
2:
 cbnz r4,2f
 movs r4,r3
 movs r3,r2
 movs r2,#0
 subs r1,#32<<20
1:
@ here r3:r4 is guaranteed nonzero
 cbnz r4,2f
 movs r4,r3
 movs r3,#0
 subs r1,#32<<20
2:
 clz r5,r4
 sub r6,r5,#0x1d
 sub r1,r1,r6,lsl#20
 lsls r4,r5
 lsls r0,r3,r5
 rsb r5,#32
 lsrs r3,r5
 orrs r4,r4,r3
 lsrs r2,r5
 orrs r0,r0,r2
@ now r0:r4 is normalised to Q63
 lsrs r0,#11
 adcs r0,r0,r4,lsl#21        @ with rounding
 adc r1,r1,r4,lsr#11
 pop {r4-r8,r15}

@===========================================

double_section trigtab
trigtab:
//     φ Q64 lo    φ Q64 hi    sin φ Q31   cos φ Q31
.long 0x25735c0b, 0x03f574a9, 0x01fab529, 0x7ffc1500
.long 0x00aa2feb, 0x0c44d954, 0x0621d2c8, 0x7fda601f
.long 0x42e86336, 0x13d9d3cf, 0x09ea5e3c, 0x7f9d88f0
.long 0x046dc42f, 0x1c0a86d9, 0x0dfe171f, 0x7f3b9ed0
.long 0xec509ba7, 0x23ebf53e, 0x11e6e7bc, 0x7ebdefc7
.long 0x039c1cd2, 0x2bf112b2, 0x15dcf546, 0x7e1e7749
.long 0x3d7a05ca, 0x33f293f4, 0x19cbc014, 0x7d5fac9c
.long 0x7aefa0a0, 0x3c19321d, 0x1dc6221d, 0x7c7d2f2f
.long 0x12fb0fb5, 0x4450ef70, 0x21c10c53, 0x7b782235
.long 0x476b8d5c, 0x4bf1a045, 0x256adc18, 0x7a68ad16
.long 0x576940c1, 0x53ead96d, 0x29361639, 0x792f2d3c
.long 0xd34eeadf, 0x5bb34289, 0x2ce039d6, 0x77e02625
.long 0x31ea5069, 0x641107d9, 0x30c4d3c0, 0x76586270
.long 0xf36756cb, 0x6bfda2b3, 0x34687e1c, 0x74c77a22
.long 0x2f7aed0e, 0x73e07531, 0x37fae7cc, 0x731c1127
.long 0xfcc48ec0, 0x7c14790e, 0x3ba3a70e, 0x7141cd8e
.long 0x3b04b713, 0x83547b8f, 0x3ed289d4, 0x6f85d8eb
.long 0x135b369a, 0x8c0b6fd9, 0x4294ead4, 0x6d51efa3
.long 0x4aca525f, 0x9439d326, 0x460a6e70, 0x6b230540
.long 0x41c44083, 0x9c0e0b34, 0x4948b03d, 0x68f1ef07
.long 0xa36d08bf, 0xa47961fd, 0x4cb1f285, 0x667a849d
.long 0xc8f81636, 0xabe547d2, 0x4fa2221e, 0x6436622f
.long 0xd0c8c9ad, 0xb3feade1, 0x52c37024, 0x61a4af86
.long 0x64730e73, 0xbbfe356a, 0x55c5f028, 0x5f02a3a3
.long 0xb08ca5c6, 0xc412b955, 0x58ba9208, 0x5c4194a3
.long 0x54530090, 0xcbb02d37, 0x5b6ef419, 0x59938ed8
.long 0xa18d7c36, 0xd3aa3c6e, 0x5e2e0225, 0x56af32fc
.long 0x48a2c7d0, 0xdbff19f6, 0x60f35332, 0x5392ed7e
.long 0x7aad9a94, 0xe422ade1, 0x638edfca, 0x50732bc7
.long 0x19ef15ff, 0xebfb85bd, 0x65fa18bb, 0x4d5c61c7
.long 0x7e0f96bd, 0xf44c4b49, 0x686f81f7, 0x4a0217e6
.long 0x1def09eb, 0xfc4dbdfd, 0x6ab2d2c4, 0x46b4e413
// maximum e=0.002617 = 0.167497*2^-6

double_wrapper_section atan2

@ datan small reduced angle case I
20:
@ r0:r1 has z=y/x in IEEE format, <2^-11
@ r2 e+11
 rsbs r12,r2,#0              @ shift down of mantissa required to get to Q63 >0
 subs r10,r12,#32
 bge 1f                      @ very small reduced angle?
 bfi r1,r3,#20,#12           @ fix up mantissa
 cmp r7,#4
 bhs 2f                      @ at least one quadrant to add? then don't need extreme accuracy
@ otherwise we need to do a power series for accuracy

 rsbs r10,#0
 lsr r3,r1,r12
 lsr r2,r0,r12
 lsl r9,r1,r10
 orr r2,r9
@ r2:r3 z Q63 (with r12 bits of loss of precision)
 lsls r1,#11
 orrs r1,r1,r0,lsr#21
 lsls r0,#11
@ r0:r1 z Q74+r12
 umull r9,r4,r2,r3
 umull r9,r5,r2,r3
 umaal r4,r5,r3,r3
@ r4:r5 z² Q62 < 2^-22
 umull r9,r2,r0,r5
 umull r9,r3,r1,r4
 umaal r2,r3,r1,r5
@ r2:r3 z³ Q72+r12 <2^-33
 umull r9,r6,r2,r5
 umull r9,r8,r3,r4
 umaal r6,r8,r3,r5
@ r6:r8 z⁵ Q70+r12 <2^-55; in fact r8 is always 0
 mov r9,#0xaaaaaaaa          @ 2/3 Q32
 umull r10,r4,r2,r9
 movs r5,#0
 umlal r4,r5,r3,r9
 adds r4,r4,r5
 adcs r5,r5,#0
@ r4:r5 z³*2/3 Q72+r12 = z³/3 Q73+r12
 mov r9,#0x33333333          @ 1/5 Q32
 umull r2,r3,r6,r9
@ r3 z⁵*1/5 Q70+r12
 subs r4,r4,r3,lsl#3
 sbc r5,r5,#0
@ r4:r5 z³/3-z⁵/5 Q73+r12
 subs r0,r0,r4,lsl#1
 sbc r1,r1,r5,lsl#1
 sub r1,r1,r4,lsr#31
@ r0:r1 z-z³/3+z⁵/5 Q74+r12
 rsb r4,r12,#0x400
 sub r4,#24
 b 60f                       @ pack and return

2:
 rsbs r10,#0
 lsls r4,r1,r10
 lsrs r0,r0,r12
 lsrs r1,r1,r12
 orrs r0,r0,r4               @ shift down r12 places
 b 50f


1:
 cmp r7,#4
 bhs 2f                      @ at least one quadrant to add?
 eors r1,r1,r7,lsl#31        @ no: return y/x with the correct sign
 pop {r4-r11,r15}

2:
 bfi r1,r3,#20,#12           @ fix up mantissa
 usat r10,#6,r10             @ saturate r10 to 63
 lsrs r0,r1,r10
 movs r1,#0                  @ shift down 32+r10 places
 b 40f

@ datan small reduced angle case II
10:
 lsrs r1,#1
 rrxs r0,r0
 movs r2,#0
 movs r3,#0
 movs r6,#0
 mov r7,#1<<30
 b 11f

@ case where reduced (x',y') has x' infinite
71:
 sbfx r4,r1,#20,#11
 movs r0,#0
 movs r1,#0
 cmn r4,#1                   @ y' also infinite?
 bne 80f
 movt r1,#0x3ff0             @ both infinite: pretend ∞/∞=1
 b 80f

@ case where reduced (x',y') has y' zero
70:
 ubfx r4,r3,#20,#11
 movs r0,#0
 movs r1,#0
 cbnz r4,80f                 @ x' also zero?
 tst r7,#4
 beq 80f                     @ already in quadrants 0/±2? then 0/0 result will be correct
 tst r7,#2
 ite eq
 addeq r7,#6
 subne r7,#6                 @ fix up when in quadrants ±0
 b 80f

90:
 movs r0,r2
 movs r1,r3
91:
 orrs r1,r1,#0x00080000
 bx r14

wrapper_func atan2
 cmp r2,#1                   @ set C if low word is non-zero
 adc r12,r3,r3
 cmn r12,#0x00200000         @ y NaN?
 bhi 90b
 cmp r0,#1                   @ set C if low word is non-zero
 adc r12,r1,r1
 cmn r12,#0x00200000         @ x NaN?
 bhi 91b
 push {r4-r11,r14}
 lsrs r7,r1,#31              @ b31..2: quadrant count; b1: sign to apply before addition; b0: sign to apply after addition
 bic r1,#1<<31
@ y now positive
 movs r3,r3
 bpl 1f

@ here y positive, x negative
 adds r7,#10
 bic r3,r3,#1<<31
 cmp r3,r1
 bhi 4f                      @ |x| > y: 3rd octant
@ |x| < y: 2nd octant
 subs r7,#6
 b 3f

1:
@ here x and y positive
 cmp r3,r1
 bhi 4f
@ x < y: 1st octant
 adds r7,#6
3:
 movs r4,r2                  @ exchange x and y
 movs r5,r3
 movs r2,r0
 movs r3,r1
 movs r0,r4
 movs r1,r5
4:

@ here
@ r0:r1 y'
@ r2:r3 x'
@ where both x' and y' are positive, y'/x' < 1+δ, and the final result is
@ ± (Q.π/2 ± atn y/x) where 0≤Q≤2 is a quadrant count in r7b3..2, the inner negation
@ is given by r7b1 and the outer negation by r7b0. x' can be infinite, or both x' and
@ y' can be infinite, but not y' alone.

 sbfx r4,r3,#20,#11
 cmn r4,#1
 beq 71b                     @ x' infinite?
 ubfx r4,r1,#20,#11
 cmp r4,#0
 beq 70b                     @ y' zero?
 bl __aeabi_ddiv
80:
@ r0:r1 y/x in IEEE format, 0..1
 lsr r2,r1,#20               @ exponent
 movs r3,#1
 subs r2,#0x3ff-11
 bmi 20b
 bfi r1,r3,#20,#12           @ fix up mantissa
 movs r3,#1
 lsl r3,r2
 umull r0,r4,r0,r3
 mla r1,r1,r3,r4
50:
 push {r7}                   @ save flags

@ from here atan2(y,1) where 1 implied
@ r0:r1 y Q63 0≤y<1+δ

 lsrs r2,r1,#16
 cmp r2,#0x100
 blo 10b                     @ small angle?
 mul r3,r2,r2                @ y^2
 movw r4,#0x895c
 muls r2,r2,r4               @ y*0x895c
 movw r5,#0x1227
 lsrs r3,#14
 mls r2,r3,r5,r2
 subs r2,#0x330000           @ Chebyshev approximation to atn(y)
 lsrs r2,#25
 ldr r3,=trigtab
 add r3,r3,r2,lsl#4
 ldmia r3,{r2-r5}
 lsrs r3,#1
 rrxs r2,r2
@ r2:r3 phi0 Q63
@ r4    sphi0
@ r5    cphi0
 umull r12,r6,r4,r0
 movs r7,#0
 umlal r6,r7,r4,r1
 adds r6,r6,r5,lsl#31
 adc r7,r7,r5,lsr#1          @ x0= ((i128)cphi0<<31)+(((i128)sphi0*(i128)y)>>32); // Q62
@ r6:r7 x0
 umull r12,r0,r5,r0
 movs r8,#0
 umlal r0,r8,r5,r1
 subs r0,r0,r4,lsl#31
 sbc r1,r8,r4,lsr#1          @ y0=-((i128)sphi0<<31)+(((i128)cphi0*(i128)y)>>32); // Q62
11:
@ r0:r1 y0
@ r2:r3 phi0
@ r6:r7 x0

 lsls r4,r1,#6
 orr r4,r4,r0,lsr#26
 lsrs r5,r7,#15
 sdiv r4,r4,r5               @  t2=(y0>>26)/(x0>>47); // Q62-26/Q62-47=Q21

 mul r5,r4,r4                @ t2_2 Q42
 add r3,r3,r4,lsl#10         @ phi0+t2
 smull r8,r9,r4,r5           @ t2_3 Q63
 mov r10,r9,lsl#16
 orr r10,r10,r8,lsr#16
 smmulr r10,r10,r5           @ t2_5 Q57
 mov r12,#0x66666666         @ 1/5 Q33
 smmulr r11,r10,r5           @ t2_7 Q67
 smmulr r10,r10,r12          @ t2_5/5 Q57+33-32=Q58

 movlong r12,0x124925        @ 1/7 Q23
 smmulr r11,r11,r12          @ t2_7/7 Q67+23-32=Q58
 mov r12,#0x55555555
 sub r11,r11,r11,asr#12      @ Q58 PMC correction
 sub r10,r10,r11

 adds r2,r2,r10,lsl#5
 adc r3,r3,r10,asr#27        @ Q63 phi0 + t_2 + t2_5/5 - t2_7/7 + t2_7/7/4096
 umull r5,r10,r8,r12
 mov r11,#0
 smlal r10,r11,r9,r12        @ t2_3 * 0x55555555
 adds r10,r10,r11
 adc r11,r11,r11,asr#31      @ t2_3/3 Q63
 subs r2,r2,r10
 sbc r3,r3,r11               @ Q63 phi0+phi1

 lsls r4,r4,#11              @ t2 Q32
 umull r5,r8,r4,r0           @ t2*y0l
 it mi
 submi r8,r8,r0              @ correction if t2 is negative
 mov r9,r8,asr#31            @ sign extend
 smlal r8,r9,r4,r1           @ t2*y0h
@ r8:r9 (t2*y0)<<11

 umull r5,r10,r4,r6          @ t2*x0l
 it mi
 submi r10,r10,r6            @ correction if t2 is negative
 mov r11,r10,asr#31          @ sign extend
 smlal r10,r11,r4,r7
@ r10:r11 (t2*x0)<<11

 adds r6,r8
 adc r7,r7,r9
 subs r0,r10
 sbc r1,r1,r11
@ r0:r1 y1=y0-t2*x0
@ r2:r3 phi0+phi1
@ r6:r7 x1=x0+t2*y0

 mov r4,#0xffffffff
 lsrs r5,r7,#14
 udiv r4,r4,r5               @ rx1 Q16
 lsrs r5,r0,#11
 orrs r5,r5,r1,lsl#21        @ N set according to y1, hence also t3
 smmul r5,r4,r5              @ t3=(y1>>11)*rx1 Q35
 lsr r6,r6,#3
 orr r6,r6,r7,lsl#29
 umull r11,r8,r5,r6          @ t3*x1l
 lsr r10,r7,#3
 it mi
 submi r8,r8,r6              @ correction if t3 is negative
 mla r8,r5,r10,r8
 adds r2,r2,r5,lsl#28
 adc r3,r3,r5,asr#4
 sub r0,r0,r8
@ r0: y2
@ r2:r3 phi0+phi1+phi2
@ r4: rx1
@ r5: t3

 smull r8,r9,r0,r4           @ y2*rx1
@ stall
 lsrs r8,#14
 orr r8,r8,r9,lsl#18         @ t4
 smmlsr r0,r8,r7,r0
 adds r2,r2,r8,asr#1
 adc r3,r3,r8,asr#31
@ r0: y3
@ r4: rx1
 mul r4,r4,r0
 adds r0,r2,r4,asr#15
 adc r1,r3,r4,asr#31
@ r0:r1 result over reduced range Q63
 pop {r7}                    @ restore flags
40:
 lsrs r1,#1
 rrxs r0,r0
@ r0:r1 result over reduced range Q62
 lsl r6,r7,#30               @ b1 -> b31
 eor r0,r0,r6,asr#31         @ negate if required
 eor r1,r1,r6,asr#31
 movlong r2,0x10B4611A       @ π/2 Q62 low word
 movlong r3,0x6487ED51       @ π/2 Q62 high word
 lsr r6,r7,#2                @ quadrants to add
 umlal r0,r1,r6,r2
 mla r1,r6,r3,r1
 mov r4,#0x400-12            @ for packing Q62
60:
 bl dpack_q
 eors r1,r1,r7,lsl#31
 pop {r4-r11,r15}

#endif