/*
 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"

#if !HAS_DOUBLE_COPROCESSOR
#error attempt to compile double_aeabi_dcp when there is no DCP
#else

#include "hardware/dcp_instr.inc.S"
#include "hardware/dcp_canned.inc.S"

pico_default_asm_setup

// double_section name
//
// Switches to the code section for "name". When PICO_DOUBLE_IN_RAM is
// non-zero the section name comes from RAM_SECTION_NAME(name), otherwise
// from SECTION_NAME(name). Either way the section flags are "ax".
.macro double_section name
#if !PICO_DOUBLE_IN_RAM
.section SECTION_NAME(\name), "ax"
#else
.section RAM_SECTION_NAME(\name), "ax"
#endif
.endm

// double_wrapper_section func
//
// Section selector for functions declared through the wrapper mechanism:
// invokes double_section with the name produced by WRAPPER_FUNC_NAME(func).
.macro double_wrapper_section func
double_section WRAPPER_FUNC_NAME(\func)
.endm

// ============== STATE SAVE AND RESTORE ===============

// saving_func type func [opt_label1 [opt_label2]]
//
// Emits the common prologue of every DCP-using function in this file.
//   type        selects the declaring macro "<type>_func" (this file uses
//               "wrapper" and "regular")
//   func        name of the entry point being declared
//   opt_label1, opt_label2
//               optional extra names declared with regular_func immediately
//               before the entry point; the default '-' means "not given"
//
// Generated layout (the function body follows the macro invocation):
//
//   1:      push {lr}                 save stub (8 bytes)
//           bl   generic_save_state
//           b    1f
//   [opt_label1:] [opt_label2:]
//   func:   PCMP apsr_nzcv
//           bmi  1b
//   1:      ... function body ...
.macro saving_func type func, opt_label1='-', opt_label2='-'
  // Note we are usually 32-bit aligned already at this point, as most of the
  // function bodies contain exactly two 16-bit instructions: bmi and bx lr.
  // We want the PCMP word-aligned. The stub below is 2 + 4 + 2 = 8 bytes, so
  // the entry point keeps the alignment established here.
.p2align 2
  // Save stub: only reached through the bmi at the entry point, i.e. when the
  // engaged flag is set. It pushes the caller's lr, invokes the save routine
  // (which hooks lr with the restore routine), and then branches forward to
  // the function body. Note that "b 1f" binds to the next definition of
  // label 1, which is the one after the bmi, so the engaged flag is not
  // tested a second time on this path.
1:
  push {lr}              // 16-bit instruction
  bl generic_save_state  // 32-bit instruction
  b 1f                   // 16-bit instruction
  // Optional additional names; they are declared between the stub and the
  // entry point with no instruction in between.
.ifnc \opt_label1,'-'
regular_func \opt_label1
.endif
.ifnc \opt_label2,'-'
regular_func \opt_label2
.endif
  // This is the actual entry point:
\type\()_func \func
  PCMP apsr_nzcv         // read the DCP status into the flags
  bmi 1b                 // N set: engaged, go through the save stub first
1:
.endm

// saving_func_return
//
// Return sequence for a function opened with saving_func: a plain bx lr.
// On the normal path lr is the caller's return address. If the save stub
// ran, the blx lr in generic_save_state has replaced lr, so the same bx lr
// lands in generic_restore_state, which returns to the real caller.
.macro saving_func_return
  bx lr
.endm

// generic_save_state / generic_restore_state
//
// Shared save/restore pair used by the stub that saving_func emits.
//
// generic_save_state is entered with bl from that stub, after the stub has
// pushed the original lr. It stores the three 64-bit values read by PXMD,
// PYMD and REFD on the stack, preserving r0 and r1, and then resumes the
// wrapped function with lr pointing at generic_restore_state.
//
// Stack frame while the wrapped function body runs (lowest address first):
//   sp +  0   value read by PXMD   (8 bytes)
//   sp +  8   value read by PYMD   (8 bytes)
//   sp + 16   value read by REFD   (8 bytes)
//   sp + 24   original lr          (4 bytes, pushed by the stub)
//
// generic_restore_state writes the three values back with WXMD, WYMD and
// WEFD and returns to the original caller. It only uses r12 and r14, so
// results left in r0-r3 by the wrapped function are not modified here.
double_section __rp2350_dcp_engaged_state_save_restore
.thumb_func
generic_save_state:
  sub sp, #24                  // room for the three 64-bit values
  push {r0, r1}                // r0, r1 are used as transfer registers below
  // do save here
  PXMD r0, r1
  strd r0, r1, [sp, #8 + 0]    // the +8 skips the r0, r1 just pushed
  PYMD r0, r1
  strd r0, r1, [sp, #8 + 8]
  REFD r0, r1
  strd r0, r1, [sp, #8 + 16]
  pop {r0, r1}                 // restore the incoming r0, r1
  blx lr                       // continue in the stub; lr := address of the next instruction
  // <- wrapped function returns here
  // fall through into restore:
.thumb_func
generic_restore_state:
  // do restore here
  pop {r12, r14}               // value saved from PXMD
  WXMD r12, r14
  pop {r12, r14}               // value saved from PYMD
  WYMD r12, r14
  pop {r12, r14}               // value saved from REFD
  WEFD r12, r14
  pop {pc}                     // original lr pushed by the stub: return to the real caller

// ============== ARITHMETIC FUNCTIONS ===============

// __aeabi_dadd: double-precision addition via the DCP.
// AEABI register usage: first operand and result in r0:r1, second operand
// in r2:r3.
double_wrapper_section __aeabi_dadd
saving_func wrapper __aeabi_dadd
  dcp_dadd_m r0, r1, r0, r1, r2, r3
  saving_func_return

// __aeabi_dsub: double-precision subtraction via the DCP.
// Operands are passed to the macro in argument order (r0:r1, then r2:r3);
// the result is written to r0:r1.
double_wrapper_section __aeabi_dsub
saving_func wrapper __aeabi_dsub
  dcp_dsub_m r0, r1, r0, r1, r2, r3
  saving_func_return

// __aeabi_drsub: reversed double-precision subtraction via the DCP.
// Same macro as __aeabi_dsub, but the two source pairs are swapped
// (r2:r3 first, r0:r1 second); the result is written to r0:r1.
double_wrapper_section __aeabi_drsub
saving_func wrapper __aeabi_drsub
  dcp_dsub_m r0, r1, r2, r3, r0, r1
  saving_func_return

// __aeabi_dmul: double-precision multiplication via the DCP.
// Besides r0-r3 and r12 the multiply macro is handed r4 and r14, so both are
// saved around it; lr is reloaded before the return.
double_wrapper_section __aeabi_dmul
saving_func wrapper __aeabi_dmul

  // todo optimize this based on final decision on saving_func_entry
  push {r4, lr}
  dcp_dmul_m r0, r1, r0, r1, r2, r3, r0, r1, r2, r3, r4, r12, r14
  // todo optimize this based on final decision on saving_func_entry
  pop {r4, lr}
  saving_func_return

// ddiv_fast: double-precision division using the "fast" DCP divide macro.
// Dividend in r0:r1, divisor in r2:r3, result in r0:r1. The trailing
// r0-r3 and r12 are extra registers handed to the macro (presumably scratch).
double_section ddiv_fast
saving_func regular ddiv_fast
  dcp_ddiv_fast_m r0, r1, r0, r1, r2, r3, r0, r1, r2, r3, r12
  saving_func_return

// __aeabi_ddiv: double-precision division with correct rounding.
// Dividend in r0:r1, divisor in r2:r3, result in r0:r1. The trailing
// r0-r3 and r12 are extra registers handed to the macro (presumably scratch).
double_wrapper_section __aeabi_ddiv
saving_func wrapper __aeabi_ddiv
  dcp_ddiv_m r0, r1, r0, r1, r2, r3, r0, r1, r2, r3, r12
  saving_func_return

// sqrt_fast: double-precision square root using the "fast" DCP macro.
// Argument and result in r0:r1. The trailing r0-r3 and r12 are extra
// registers handed to the macro (presumably scratch).
double_section sqrt_fast
saving_func regular sqrt_fast
  dcp_dsqrt_fast_m r0, r1, r0, r1, r0, r1, r2, r3, r12
  saving_func_return

// sqrt: double-precision square root with correct rounding.
// Argument and result in r0:r1. The trailing r0-r3 and r12 are extra
// registers handed to the macro (presumably scratch).
double_wrapper_section sqrt
saving_func wrapper sqrt
  dcp_dsqrt_m r0, r1, r0, r1, r0, r1, r2, r3, r12
  saving_func_return

// dclassify: classify the double in r0:r1.
// The classification is delivered in the APSR flags (the macro target is
// apsr_nzcv); r0 and r1 are only passed as inputs here.
double_section dclassify
saving_func regular dclassify
  dcp_dclassify_m apsr_nzcv, r0, r1
  saving_func_return

// ============== CONVERSION FUNCTIONS ===============

// __aeabi_d2f (also exported as double2float): double to float, with rounding.
// Input in r0:r1, result in r0.
double_wrapper_section __aeabi_d2f
saving_func wrapper __aeabi_d2f double2float
  dcp_double2float_m r0, r0, r1
  saving_func_return

// __aeabi_i2d (also exported as int2double): signed 32-bit integer to double.
// Input in r0, result in r0:r1.
double_wrapper_section __aeabi_i2d
saving_func wrapper __aeabi_i2d int2double
  dcp_int2double_m r0, r1, r0
  saving_func_return

// __aeabi_ui2d (also exported as uint2double): unsigned 32-bit integer to
// double. Input in r0, result in r0:r1.
double_wrapper_section __aeabi_ui2d
saving_func wrapper __aeabi_ui2d uint2double
  dcp_uint2double_m r0, r1, r0
  saving_func_return

// double2fix_z: scale a double by a power of two, then convert to a signed
// integer with truncation.
//   r1 = high word of the double (sign, exponent, top of mantissa)
//   r0 = low word (not touched here)
//   r2 = value added to the exponent field (presumably the number of
//        fractional bits of the fixed-point result - confirm with callers)
// The adjusted exponent is clamped to the 11-bit field and written back into
// r1; the conversion itself is done by the code at double2int_z_entry.
double_section double2fix_z
saving_func regular double2fix_z
  ubfx r3, r1, #20, #11 // r3 = bits 30..20 of r1, the biased exponent field
  adds r3, r2 // apply the exponent adjustment
  beq 1f // very small; we don't care that we might make a denormal
  asrs ip, r3, #11 // ip == 0 when 0 < r3 < 0x800, i.e. the sum fits the field
  beq 1f
  ite pl
  movpl r3, #0x7ff // sum is 0x800 or more: use the all-ones exponent
  movsmi r3, #0 // sum is negative: use a zero exponent
1:
  bfi r1, r3, #20, #11 // write the adjusted exponent back into r1
  b double2int_z_entry // tail-branch to the truncating conversion

// double2ufix / double2ufix_z: scale a double by a power of two, then convert
// to an unsigned integer with truncation.
//   r1 = high word of the double (sign, exponent, top of mantissa)
//   r0 = low word (not touched here)
//   r2 = value added to the exponent field (presumably the number of
//        fractional bits of the fixed-point result - confirm with callers)
// The adjusted exponent is clamped to the 11-bit field and written back into
// r1; the conversion itself is done by the code at double2uint_z_entry.
//
// On overflow of the exponent field the exponent is forced to 0x7ff, exactly
// as in double2fix_z and double2fix. The previous "lsrs r3, r1, #20" left the
// original exponent in the low 11 bits of r3, so the bfi re-inserted the
// unscaled exponent instead of saturating.
double_section double2ufix
saving_func regular double2ufix_z double2ufix
double2ufix_z_entry:
  ubfx r3, r1, #20, #11 // r3 = bits 30..20 of r1, the biased exponent field
  adds r3, r2 // apply the exponent adjustment
  beq 1f // very small; we don't care that we might make a denormal
  asrs ip, r3, #11 // ip == 0 when 0 < r3 < 0x800, i.e. the sum fits the field
  beq 1f
  ite pl
  movpl r3, #0x7ff // sum is 0x800 or more: use the all-ones exponent
  movsmi r3, #0 // sum is negative: use a zero exponent
1:
  bfi r1, r3, #20, #11 // write the adjusted exponent back into r1
  b double2uint_z_entry // tail-branch to the truncating unsigned conversion

// double2fix: scale a double by a power of two, then convert to a signed
// integer through double2int_entry (the non-truncating variant).
//   r1 = high word of the double (sign, exponent, top of mantissa)
//   r0 = low word
//   r2 = value added to the exponent field (presumably the number of
//        fractional bits of the fixed-point result - confirm with callers)
// An input whose exponent field is zero (0 or denormal) returns 0 directly.
// NOTE(review): when the adjusted exponent ends up as 0, double2int_entry
// takes its exponent-zero path to the truncating conversion, also for
// negative inputs - confirm that this is the intended result for tiny
// negative values.
double_section double2fix
saving_func regular double2fix
  ubfx r3, r1, #20, #11 // r3 = bits 30..20 of r1, the biased exponent field
  cbz r3, 2f // 0 or denormal
  adds r3, r2 // apply the exponent adjustment
  beq 1f // very small; we don't care that we might make a denormal
  asrs ip, r3, #11 // ip == 0 when 0 < r3 < 0x800, i.e. the sum fits the field
  beq 1f
  ite pl
  movpl r3, #0x7ff // sum is 0x800 or more: use the all-ones exponent
  movsmi r3, #0 // sum is negative: use a zero exponent
1:
  bfi r1, r3, #20, #11 // write the adjusted exponent back into r1
  b double2int_entry // tail-branch to the conversion in double2int
2:
  movs r0, #0 // zero or denormal input: result is 0
saving_func_return


// double2int: convert the double in r0 (low word) / r1 (high word) to a
// signed 32-bit integer in r0.
// Positive values, zeros, denormals, exact integers and values with an
// unbiased exponent of 31 or more are handed to the truncating conversion at
// double2int_z_entry. Every other negative value is truncated and then
// decremented by one, which gives rounding towards minus infinity.
// Clobbers r2, r3 and the flags.
double_section double2int
saving_func regular double2int
double2int_entry:
  lsls r2, r1, #1 // drop the sign bit from the high word; the sign goes to C
  bcc double2int_z_entry // positive is ok for int_z
  lsrs r3, r2, #21 // r3 = biased exponent
  beq double2int_z_entry // 0 or -0 or denormal is ok for int_z

  lsrs r2, #21 // r2 = biased exponent
  adds r2, #1
  subs r2, r2, #0x400 // r2 = exponent - 0x3ff; borrow (C clear) when exponent < 0x3ff
  bcc 1f // <1 means subtract 1
  cmp r2, #31
  bge double2int_z_entry // must be an integer or maxed out
  lsls r3, r1, #12 // mantissa bits of the high word, moved to the top of r3
  adds r3, r3, r0, lsr #20 // r3 now has highest 32 mantissa bits
  lsls r3, r2 // shift out the bits that belong to the integer part
  orrs r3, r3, r0, lsl #12 // these bits are all guaranteed to be in the fraction
  beq double2int_z_entry // integer
1:
  dcp_double2int_m r0,r0,r1 // same conversion macro as at double2int_z_entry
  subs r0, #1 // negative with a fractional part: step down by one
saving_func_return

// __aeabi_d2iz (also exported as double2int_z): double to signed 32-bit
// integer, truncating towards 0. Input in r0:r1, result in r0.
// double2int_z_entry is the branch target used by the other conversion
// routines; it sits after the saving_func prologue.
double_wrapper_section __aeabi_d2iz
saving_func wrapper __aeabi_d2iz double2int_z
double2int_z_entry:
  dcp_double2int_m r0, r0, r1
  // The return is just a bx lr, so it is correct both when the caller came
  // through a save stub and when it did not.
  saving_func_return

// __aeabi_d2uiz (also exported as double2uint and double2uint_z): double to
// unsigned 32-bit integer, truncating towards 0. Input in r0:r1, result in r0.
// double2uint_z_entry is the branch target used by double2ufix; it sits
// after the saving_func prologue.
double_wrapper_section __aeabi_d2uiz
saving_func wrapper __aeabi_d2uiz double2uint double2uint_z
double2uint_z_entry:
  dcp_double2uint_m r0, r0, r1
  saving_func_return

// double2int_r: double to signed 32-bit integer, with rounding.
// Input in r0:r1, result in r0.
double_section double2int_r
saving_func regular double2int_r
  dcp_double2int_r_m r0, r0, r1
  saving_func_return

// double2uint_r: double to unsigned 32-bit integer, with rounding.
// Input in r0:r1, result in r0.
double_section double2uint_r
saving_func regular double2uint_r
  dcp_double2uint_r_m r0, r0, r1
  saving_func_return

// ============== COMPARISON FUNCTIONS ===============

// __aeabi_dcmpun: returns the "unordered" result of comparing r0:r1 with
// r2:r3. The comparison result is read into r0 as a word; bit 28 (the
// position of V in a flags word) is the unordered bit.
double_wrapper_section __aeabi_dcmpun
saving_func wrapper __aeabi_dcmpun
  dcp_dcmp_m r0, r0, r1, r2, r3
  ubfx r0, r0, #28, #1    // r0 = unordered bit
  saving_func_return

// __aeabi_cdrcmple: flag-setting comparison with the operands reversed
// (r2:r3 is passed first, r0:r1 second). The result is left in the APSR
// flags; an unordered result (V set) is fixed up in cmp_nan.
double_wrapper_section __aeabi_dcmp
saving_func wrapper __aeabi_cdrcmple
  dcp_dcmp_m apsr_nzcv, r2, r3, r0, r1
  bvs cmp_nan
  saving_func_return

// __aeabi_cdcmple: flag-setting comparison of r0:r1 with r2:r3.
// In the absence of exceptions this routine and __aeabi_cdcmpeq can be the
// same code. The result is left in the APSR flags; an unordered result
// (V set) is fixed up in cmp_nan.
saving_func wrapper __aeabi_cdcmple
  dcp_dcmp_m apsr_nzcv, r0, r1, r2, r3
  bvs cmp_nan
  saving_func_return

// __aeabi_cdcmpeq: flag-setting equality comparison of r0:r1 with r2:r3.
// The ABI documentation does not make clear whether cdcmpeq has to set the C
// flag the same way cdcmple does. If it does not, the bvs below could be
// dropped; it is kept to err on the side of caution.
saving_func wrapper __aeabi_cdcmpeq
  dcp_dcmp_m apsr_nzcv, r0, r1, r2, r3
  bvs cmp_nan
  saving_func_return

// cmp_nan: shared tail for the flag-setting comparisons when the result is
// "unordered"; in that case C must be set and Z cleared.
// Shifting the constant 3 right by one moves a 1 into C and leaves a
// non-zero result, which clears Z.
// A shorter variant would be lsrs r12,r14,#1, or even cmp r14,r14,lsr#1,
// because (a) r14 is a return address here and its bit 0 is 1 for Thumb mode
// and (b) a return to address 0 is unlikely.
cmp_nan:
  movs r12, #3            // r12 does not need to be preserved by the flag-setting comparisons
  lsrs r12, r12, #1       // set C, clear Z
  saving_func_return

// int FUNC_NAME(__aeabi_dcmpeq)(double, double)
//   result (1, 0) denotes (=, ?<>) [2], use for C == and !=
// The comparison result is read into r0 as a word; bit 30 (the position of Z
// in a flags word) is the equality bit.
double_wrapper_section __aeabi_dcmpeq
saving_func wrapper __aeabi_dcmpeq
  dcp_dcmp_m r0, r0, r1, r2, r3
  ubfx r0, r0, #30, #1    // r0 = Z bit
  saving_func_return

// int FUNC_NAME(__aeabi_dcmplt)(double, double)
//   result (1, 0) denotes (<, ?>=) [2], use for C <
// The operands are compared in reversed order (r2:r3 first), so the result
// is 1 exactly when the flags satisfy "hi" and 0 when they satisfy "ls".
double_wrapper_section __aeabi_dcmplt
saving_func wrapper __aeabi_dcmplt
  dcp_dcmp_m apsr_nzcv, r2, r3, r0, r1
  ite ls
  movls r0, #0
  movhi r0, #1
  saving_func_return

// int FUNC_NAME(__aeabi_dcmple)(double, double)
//   result (1, 0) denotes (<=, ?>) [2], use for C <=
// The operands are compared in reversed order (r2:r3 first), so the result
// is 1 exactly when the flags satisfy "hs" and 0 when they satisfy "lo".
double_wrapper_section __aeabi_dcmple
saving_func wrapper __aeabi_dcmple
  dcp_dcmp_m apsr_nzcv, r2, r3, r0, r1
  ite lo
  movlo r0, #0
  movhs r0, #1
  saving_func_return

// int FUNC_NAME(__aeabi_dcmpge)(double, double)
//   result (1, 0) denotes (>=, ?<) [2], use for C >=
// The operands are compared in argument order (r0:r1 first); the result is 1
// exactly when the flags satisfy "hs" and 0 when they satisfy "lo".
double_wrapper_section __aeabi_dcmpge
saving_func wrapper __aeabi_dcmpge
  dcp_dcmp_m apsr_nzcv, r0, r1, r2, r3
  ite lo
  movlo r0, #0
  movhs r0, #1
  saving_func_return

// int FUNC_NAME(__aeabi_dcmpgt)(double, double)
//   result (1, 0) denotes (>, ?<=) [2], use for C >
// The operands are compared in argument order (r0:r1 first); the result is 1
// exactly when the flags satisfy "hi" and 0 when they satisfy "ls".
double_wrapper_section __aeabi_dcmpgt
saving_func wrapper __aeabi_dcmpgt
  dcp_dcmp_m apsr_nzcv, r0, r1, r2, r3
  ite ls
  movls r0, #0
  movhi r0, #1
  saving_func_return

#endif