/*
 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "pico/asm_helper.S"
#include "hardware/hazard3.h"

// This file reimplements some common single-precision soft float routines
// from libgcc. It targets the RV32IMBZbkb dialect (plus optionally Xh3bextm)
// and is tuned for Hazard3 execution timings.

// Subnormal values are always flushed to zero on both input and output.
// Rounding is always to nearest (even on tie).

pico_default_asm_setup

.macro float_section name
#if PICO_FLOAT_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

float_section __addsf3

.global __subsf3
.p2align 2
__subsf3:
    binvi a1, a1, 31
.global __addsf3
__addsf3:
    // Unpack exponent:
    h3.bextmi a2, a0, 23, 8
    h3.bextmi a3, a1, 23, 8
    // Flush-to-zero => 0 + y = y applies, including nan, with the sole
    // exception of y being subnormal (which also needs to be flushed)
    beqz a2, __addsf_return_y_flushed
    // Don't have to handle this case for x + 0 = 0 because we already know x
    // is nonzero
    beqz a3, __addsf_return_x
    // Unpack significand, plus 3 extra zeroes for working space:
    slli a4, a0, 9
    slli a5, a1, 9
    // Check nan/inf on input
    li t0, 255
    beq a2, t0, __addsf_x_nan_inf
    beq a3, t0, __addsf_y_nan_inf
    // (finish unpacking significand)
    srli a4, a4, 6
    srli a5, a5, 6
    // If we're still on the straight path then we are adding two normal
    // values. Add implicit one (1.xx...xx000)
    bseti a4, a4, 23 + 3
    bseti a5, a5, 23 + 3
    // Negate if sign bit is set
    bgez a0, 1f
    neg a4, a4
1:
    // (tuck this 16-bit instruction here to avoid alignment penalty)
    li t1, 25
    bgez a1, 1f
    neg a5, a5
1:
    bltu a2, a3, __addsf_ye_gt_xe

// The main body is repeated twice with different register assignments.
// lhs is the more-significant addend:
.macro addsf_core packed_lhs, packed_rhs, sig_lhs, sig_rhs, exp_lhs, exp_rhs, rhs_is_x
    sub \packed_rhs, \exp_lhs, \exp_rhs
    // If there is a large exponent difference then there is no effect on lhs
.if \rhs_is_x
    bgeu \packed_rhs, t1, __addsf_return_y
.else
    bgeu \packed_rhs, t1, __addsf_return_x
.endif
    // Shift rhs down to correct relative significance
    sra \packed_lhs, \sig_rhs, \packed_rhs
    // Set sticky bit if ones were shifted out
    sll \packed_rhs, \packed_lhs, \packed_rhs
    sltu \packed_rhs, \packed_rhs, \sig_rhs
    or \packed_lhs, \packed_lhs, \packed_rhs
    // Add significands
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Detect exact cancellation (may be beyond max normalisation shift; also
    // IEEE 754 requires +0 for exact cancellation, no matter input signs)
    beqz \sig_lhs, __addsf_return_0
    // Convert two's complement back to sign + magnitude
    srai \exp_rhs, \sig_lhs, 31
    xor \sig_lhs, \sig_lhs, \exp_rhs
    sub \sig_lhs, \sig_lhs, \exp_rhs
    // Renormalise significand: bit 31 is now implicit one
    clz \packed_lhs, \sig_lhs
    sll \sig_lhs, \sig_lhs, \packed_lhs
    // Adjust exponent
    addi \packed_lhs, \packed_lhs, -5
    sub \exp_lhs, \exp_lhs, \packed_lhs
    // Round to nearest, even on tie (bias upward if odd)
    bexti \packed_lhs, \sig_lhs, 8
    addi \sig_lhs, \sig_lhs, 127
    add \sig_lhs, \sig_lhs, \packed_lhs
    // Exponent may increase by one due to rounding up from all-ones; this is
    // detected by clearing of implicit one (there is a carry-out too)
    bgez \sig_lhs, 3f
4:
    // Detect underflow/overflow
    bgeu \exp_lhs, t0, 1f
    // Pack and return
    packh \exp_lhs, \exp_lhs, \exp_rhs
    slli \exp_lhs, \exp_lhs, 23
    slli \sig_lhs, \sig_lhs, 1
    srli \sig_lhs, \sig_lhs, 9
    add a0, \sig_lhs, \exp_lhs
    ret
1:
    bgez \exp_lhs, 2f
    // Signed zero on underflow
    slli a0, \exp_rhs, 31
    ret
2:
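    // Note on the packh below: it concatenates the low bytes of its two
    // source registers, so with t0 = 0xff and \exp_rhs holding the result
    // sign as 0 or -1 it produces {sign_byte, 0xff}; only the low 9 bits
    // survive the slli by 23, leaving 0x7f800000 or 0xff800000, the +-inf
    // encodings.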
    // Signed infinity on overflow
    packh a0, t0, \exp_rhs
    slli a0, a0, 23
    ret
3:
    // Exponent increase due to rounding (uncommon)
    srli \sig_lhs, \sig_lhs, 1
    addi \exp_lhs, \exp_lhs, 1
    j 4b
.endm

__addsf_xe_gte_ye:
    addsf_core a0, a1, a4, a5, a2, a3, 0
.p2align 2
__addsf_ye_gt_xe:
    addsf_core a1, a0, a5, a4, a3, a2, 1

__addsf_x_nan_inf:
    // When at least one operand is nan, we must propagate at least one of
    // those nan payloads (sign of nan result is unspecified, which we take
    // advantage of by implementing x - y as x + -y). Check x nan vs inf:
    bnez a4, __addsf_return_x
__addsf_x_inf:
    // If x is +-inf, need to distinguish the following cases:
    bne a3, t0, __addsf_return_x  // y is neither inf nor nan -> return x (propagate inf)
    bnez a5, __addsf_return_y     // y is nan: -> return y (propagate nan)
    xor a5, a0, a1
    srli a5, a5, 31
    beqz a5, __addsf_return_x     // y is inf of same sign -> return either x or y (x is faster)
    li a0, -1                     // y is inf of different sign -> return nan
    ret

__addsf_y_nan_inf:
    // Mirror of __addsf_x_nan_inf
    bnez a5, __addsf_return_y
__addsf_y_inf:
    bne a2, t0, __addsf_return_y
    bnez a4, __addsf_return_x
    xor a4, a0, a1
    srli a4, a4, 31
    beqz a4, __addsf_return_x
    li a0, -1
    ret

__addsf_return_y_flushed:
    bnez a3, 1f
    srli a1, a1, 23
    slli a1, a1, 23
1:
__addsf_return_y:
    mv a0, a1
__addsf_return_x:
    ret

__addsf_return_0:
    li a0, 0
    ret

float_section __mulsf3

.global __mulsf3
.p2align 2
__mulsf3:
    // Force y to be positive (by possibly negating x) *before* unpacking.
    // This allows many special cases to be handled without repacking.
    bgez a1, 1f
    binvi a0, a0, 31
1:
    // Unpack exponent:
    h3.bextmi a2, a0, 23, 8
    h3.bextmi a3, a1, 23, 8
    // Check special cases
    li t0, 255
    beqz a2, __mulsf_x_0
    beqz a3, __mulsf_y_0
    beq a2, t0, __mulsf_x_nan_inf
    beq a3, t0, __mulsf_y_nan_inf
    // Finish unpacking sign
    srai a6, a0, 31
    // Unpack significand (with implicit one in MSB)
    slli a4, a0, 8
    slli a5, a1, 8
    bseti a4, a4, 31
    bseti a5, a5, 31
    // Get full 64-bit multiply result in a4:a1 (one cycle each half)
    // Going from Q1.23 to Q2.46 (both left-justified)
    mul a1, a4, a5
    mulhu a4, a4, a5
    // Normalise (shift left by either 0 or 1) -- bit 8 is the LSB of the
    // final significand (ignoring rounding)
    clz a0, a4
    sll a4, a4, a0
    sub a2, a2, a0
    add a2, a2, a3
    // Subtract redundant bias term (127), add 1 for normalisation correction
    addi a2, a2, -126
    blez a2, __mulsf_underflow
    // Gather sticky bits from low fraction:
    snez a1, a1
    or a4, a4, a1
    // Round to nearest, even on tie (aka bias upward if odd)
    bexti a1, a4, 8
    add a4, a4, a1
    addi a4, a4, 127
    // Check carry-out: exponent may increase due to rounding
    bgez a4, 2f
1:
    bge a2, t0, __mulsf_overflow
    // Pack it and ship it
    packh a2, a2, a6
    slli a2, a2, 23
    slli a4, a4, 1
    srli a4, a4, 9
    add a0, a4, a2
    ret
2:
    srli a4, a4, 1
    addi a2, a2, 1
    j 1b

__mulsf_underflow:
    // Signed zero
    slli a0, a6, 31
    ret

__mulsf_overflow:
    // Signed inf
    packh a0, t0, a6
    slli a0, a0, 23
    ret

__mulsf_x_0:
    // 0 times nan -> propagate nan
    // 0 times inf -> generate nan
    // 0 times others -> 0 (need to flush significand too as we are FTZ)
    bne a3, t0, __mulsf_return_flushed_x
    slli a5, a1, 9
    beqz a5, 1f
    // Propagate nan from y
__mulsf_return_y:
    mv a0, a1
    ret
1:
    // Generate new nan
    li a0, -1
    ret

__mulsf_y_0:
    // Mirror image of x_0 except we still return x for signed 0, since the
    // signs were already resolved.
    bne a2, t0, __mulsf_return_flushed_x
    slli a1, a0, 9
    bnez a1, 1f
    li a0, -1
1:
    ret

__mulsf_return_flushed_x:
    // If we don't support subnormals we at least need to flush to a canonical
    // zero. This is just a sign bit in bit 31.
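    // (the shift pair below is equivalent to a0 &= 0x80000000: it clears the
    // exponent and fraction fields while keeping the sign)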
    srli a0, a0, 31
    slli a0, a0, 31
__mulsf_return_x:
    ret

__mulsf_x_nan_inf:
    // We know that y is not zero and is positive. So...
    // x is nan -> return x
    // else y is nan -> return y
    // else y is inf -> return x
    // else y is normal -> return x
    // (the order of the first two clauses is actually our free choice)
    slli a4, a0, 9
    bnez a4, __mulsf_return_x
    bne a3, t0, __mulsf_return_x
    slli a5, a1, 9
    bnez a5, __mulsf_return_y
    ret  // return x

__mulsf_y_nan_inf:
    // We know that x is not zero, nan, nor inf. That just leaves normals.
    // y is nan -> return y
    // y is inf -> return inf * sgn(x) (since we already merged the signs)
    slli a5, a1, 9
    bnez a5, __mulsf_return_y
    srai a0, a0, 31
    packh a0, t0, a0
    slli a0, a0, 23
    ret

// This is a hack to improve soft float performance for the routines we don't
// implement here (e.g. libm) in libraries built against a non-Zbb ISA
// dialect: provide libgcc's __clzsi2 helper as a single clz instruction.
float_section __clzsi2

.global __clzsi2
__clzsi2:
    clz a0, a0
    ret
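
// For reference, a rough C sketch (illustrative only, not part of the build)
// of the round-to-nearest-even step shared by __addsf3 and __mulsf3 above.
// Here 'sig' is the left-justified 32-bit significand with the implicit one
// in bit 31, bit 8 is the LSB of the final result, and the sticky bit has
// been ORed into the low bits:
//
//     sig += 127 + ((sig >> 8) & 1);  // ties round toward an even LSB
//     if ((int32_t)sig >= 0) {        // carry out of bit 31: all-ones case
//         sig >>= 1;                  // fraction is now zero...
//         exp += 1;                   // ...and the exponent goes up by one
//     }
//     result = (sign << 31) | (exp << 23) | ((sig << 1) >> 9);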