aboutsummaryrefslogtreecommitdiffstats
path: root/qfplib-port.h
diff options
context:
space:
mode:
Diffstat (limited to 'qfplib-port.h')
-rw-r--r--qfplib-port.h381
1 files changed, 381 insertions, 0 deletions
diff --git a/qfplib-port.h b/qfplib-port.h
new file mode 100644
index 0000000..1070d35
--- /dev/null
+++ b/qfplib-port.h
@@ -0,0 +1,381 @@
+inline float qfp_fpow(float b, float e)
+{
+ return qfp_fexp(qfp_fmul(e, qfp_fln(b)));
+}
+
+inline float qfp_flog10(float x)
+{
+ static const auto ln10 = qfp_fln(10.f);
+ return qfp_fdiv(qfp_fln(x), ln10);
+}
+
+__attribute__((naked, section(".data")))
+inline float qfp_fadd_asm(float, float)
+{
+asm(R"(
+.syntax unified
+ push {r4,r5,r6,r14}
+ asrs r4,r0,#31
+ lsls r2,r0,#1
+ lsrs r2,#24 @ x exponent
+ beq fa_xe0
+ cmp r2,#255
+ beq fa_xe255
+fa_xe:
+ asrs r5,r1,#31
+ lsls r3,r1,#1
+ lsrs r3,#24 @ y exponent
+ beq fa_ye0
+ cmp r3,#255
+ beq fa_ye255
+fa_ye:
+ ldr r6,=#0x007fffff
+ ands r0,r0,r6 @ extract mantissa bits
+ ands r1,r1,r6
+ adds r6,#1 @ r6=0x00800000
+ orrs r0,r0,r6 @ set implied 1
+ orrs r1,r1,r6
+ eors r0,r0,r4 @ complement...
+ eors r1,r1,r5
+ subs r0,r0,r4 @ ... and add 1 if sign bit is set: 2's complement
+ subs r1,r1,r5
+ subs r5,r3,r2 @ ye-xe
+ subs r4,r2,r3 @ xe-ye
+ bmi fa_ygtx
+@ here xe>=ye
+ cmp r4,#30
+ bge fa_xmgty @ xe much greater than ye?
+ adds r5,#32
+ movs r3,r2 @ save exponent
+@ here y in r1 must be shifted down r4 places to align with x in r0
+ movs r2,r1
+ lsls r2,r2,r5 @ keep the bits we will shift off the bottom of r1
+ asrs r1,r1,r4
+ b fa_0
+
+.ltorg
+
+fa_ymgtx:
+ movs r2,#0 @ result is just y
+ movs r0,r1
+ b fa_1
+fa_xmgty:
+ movs r3,r2 @ result is just x
+ movs r2,#0
+ b fa_1
+
+fa_ygtx:
+@ here ye>xe
+ cmp r5,#30
+ bge fa_ymgtx @ ye much greater than xe?
+ adds r4,#32
+@ here x in r0 must be shifted down r5 places to align with y in r1
+ movs r2,r0
+ lsls r2,r2,r4 @ keep the bits we will shift off the bottom of r0
+ asrs r0,r0,r5
+fa_0:
+ adds r0,r1 @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3
+ beq fa_9 @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0
+fa_1:
+ lsrs r1,r0,#31 @ sign bit
+ beq fa_8
+ mvns r0,r0
+ rsbs r2,r2,#0
+ bne fa_8
+ adds r0,#1
+fa_8:
+ adds r6,r6
+@ r6=0x01000000
+ cmp r0,r6
+ bhs fa_2
+fa_3:
+ adds r2,r2 @ normalisation loop
+ adcs r0,r0
+ subs r3,#1 @ adjust exponent
+ cmp r0,r6
+ blo fa_3
+fa_2:
+@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit
+ lsrs r0,#1
+ bcc fa_4
+@ rounding bits here are 1:r2
+ adds r0,#1 @ round up
+ cmp r2,#0
+ beq fa_5 @ sticky bits all zero?
+fa_4:
+ cmp r3,#254
+ bhs fa_6 @ exponent too large or negative?
+ lsls r1,#31 @ pack everything
+ add r0,r1
+ lsls r3,#23
+ add r0,r3
+fa_end:
+ pop {r4,r5,r6,r15}
+
+fa_9:
+ cmp r2,#0 @ result zero?
+ beq fa_end @ return +0
+ b fa_1
+
+fa_5:
+ lsrs r0,#1
+ lsls r0,#1 @ round to even
+ b fa_4
+
+fa_6:
+ bge fa_7
+@ underflow
+@ can handle denormals here
+ lsls r0,r1,#31 @ result is signed zero
+ pop {r4,r5,r6,r15}
+fa_7:
+@ overflow
+ lsls r0,r1,#8
+ adds r0,#255
+ lsls r0,#23 @ result is signed infinity
+ pop {r4,r5,r6,r15}
+
+
+fa_xe0:
+@ can handle denormals here
+ subs r2,#32
+ adds r2,r4 @ exponent -32 for +Inf, -33 for -Inf
+ b fa_xe
+
+fa_xe255:
+@ can handle NaNs here
+ lsls r2,#8
+ add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
+ b fa_xe
+
+fa_ye0:
+@ can handle denormals here
+ subs r3,#32
+ adds r3,r5 @ exponent -32 for +Inf, -33 for -Inf
+ b fa_ye
+
+fa_ye255:
+@ can handle NaNs here
+ lsls r3,#8
+ add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
+ b fa_ye
+ )");
+}
+
+__attribute__((naked, section(".data")))
+inline float qfp_fmul_asm(float, float)
+{
+asm(R"(
+.syntax unified
+ push {r7,r14}
+ mov r2,r0
+ eors r2,r1 @ sign of result
+ lsrs r2,#31
+ lsls r2,#31
+ mov r14,r2
+ lsls r0,#1
+ lsls r1,#1
+ lsrs r2,r0,#24 @ xe
+ beq fm_xe0
+ cmp r2,#255
+ beq fm_xe255
+fm_xe:
+ lsrs r3,r1,#24 @ ye
+ beq fm_ye0
+ cmp r3,#255
+ beq fm_ye255
+fm_ye:
+ adds r7,r2,r3 @ exponent of result (will possibly be incremented)
+ subs r7,#128 @ adjust bias for packing
+ lsls r0,#8 @ x mantissa
+ lsls r1,#8 @ y mantissa
+ lsrs r0,#9
+ lsrs r1,#9
+
+ adds r2,r0,r1 @ for later
+ mov r12,r2
+ lsrs r2,r0,#7 @ x[22..7] Q16
+ lsrs r3,r1,#7 @ y[22..7] Q16
+ muls r2,r2,r3 @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
+ muls r0,r0,r1 @ result [31..0] Q46
+ lsrs r2,#18 @ result [45..32] Q14
+ bcc 1f
+ cmp r0,#0
+ bmi 1f
+ adds r2,#1 @ fix error in r2
+1:
+ lsls r3,r0,#9 @ bits off bottom of result
+ lsrs r0,#23 @ Q23
+ lsls r2,#9
+ adds r0,r2 @ cut'n'shut
+ add r0,r12 @ implied 1*(x+y) to compensate for no insertion of implied 1s
+@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
+
+ lsrs r1,r0,#23
+ bne fm_0 @ branch if we need to shift down one place
+@ here 1<=result<2
+ cmp r7,#254
+ bhs fm_3a @ catches both underflow and overflow
+ lsls r3,#1 @ sticky bits at top of R3, rounding bit in carry
+ bcc fm_1 @ no rounding
+ beq fm_2 @ rounding tie?
+ adds r0,#1 @ round up
+fm_1:
+ adds r7,#1 @ for implied 1
+ lsls r7,#23 @ pack result
+ add r0,r7
+ add r0,r14
+ pop {r7,r15}
+fm_2: @ rounding tie
+ adds r0,#1
+fm_3:
+ lsrs r0,#1
+ lsls r0,#1 @ clear bottom bit
+ b fm_1
+
+@ here 1<=result-1<3
+fm_0:
+ adds r7,#1 @ increment exponent
+ cmp r7,#254
+ bhs fm_3b @ catches both underflow and overflow
+ lsrs r0,#1 @ shift mantissa down
+ bcc fm_1a @ no rounding
+ adds r0,#1 @ assume we will round up
+ cmp r3,#0 @ sticky bits
+ beq fm_3c @ rounding tie?
+fm_1a:
+ adds r7,r7
+ adds r7,#1 @ for implied 1
+ lsls r7,#22 @ pack result
+ add r0,r7
+ add r0,r14
+ pop {r7,r15}
+
+fm_3c:
+ lsrs r0,#1
+ lsls r0,#1 @ clear bottom bit
+ b fm_1a
+
+fm_xe0:
+ subs r2,#16
+fm_xe255:
+ lsls r2,#8
+ b fm_xe
+fm_ye0:
+ subs r3,#16
+fm_ye255:
+ lsls r3,#8
+ b fm_ye
+
+@ here the result is under- or overflowing
+fm_3b:
+ bge fm_4 @ branch on overflow
+@ trap case where result is denormal 0x007fffff + 0.5ulp or more
+ adds r7,#1 @ exponent=-1?
+ bne fm_5
+@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
+@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
+ adds r0,#2
+ lsrs r0,#23
+ cmp r0,#3
+ bne fm_5
+ b fm_6
+
+fm_3a:
+ bge fm_4 @ branch on overflow
+@ trap case where result is denormal 0x007fffff + 0.5ulp or more
+ adds r7,#1 @ exponent=-1?
+ bne fm_5
+ adds r0,#1 @ mantissa=0xffffff (i.e., r0=0x7fffff)?
+ lsrs r0,#23
+ beq fm_5
+fm_6:
+ movs r0,#1 @ return smallest normal
+ lsls r0,#23
+ add r0,r14
+ pop {r7,r15}
+
+fm_5:
+ mov r0,r14
+ pop {r7,r15}
+fm_4:
+ movs r0,#0xff
+ lsls r0,#23
+ add r0,r14
+ pop {r7,r15}
+)");
+}
+
+__attribute__((naked, section(".data")))
+inline float qfp_int2float_asm(int)
+{
+asm(R"(
+.syntax unified
+ movs r1,#0 @ fall through
+ push {r4,r5,r6,r14}
+ movs r2,#29
+ subs r2,r1 @ fix exponent
+ movs r5,#0
+ bl qfp_int2float_packx
+ pop {r4,r5,r6,r15}
+qfp_int2float_packx:
+ lsrs r4,r0,#31 @ save sign bit
+ lsls r4,r4,#31 @ sign now in b31
+ bpl 2f @ skip if positive
+ cmp r5,#0
+ beq 11f
+ adds r0,#1 @ fiddle carry in to following rsb if sticky bits are non-zero
+11:
+ rsbs r0,#0 @ can now treat r0 as unsigned
+ bmi 3f @ catch r0=0x80000000 case
+2:
+ subs r2,#1 @ normalisation loop
+ adds r0,r0
+ beq 1f @ zero? special case
+ bpl 2b @ normalise so leading "1" in bit 31
+3:
+ adds r2,#129 @ (mis-)offset exponent
+ bne 12f @ special case: highest denormal can round to lowest normal
+ adds r0,#0x80 @ in special case, need to add 256 to r0 for rounding
+ bcs 4f @ tripped carry? then have leading 1 in C as required
+12:
+ adds r0,#0x80 @ rounding
+ bcs 4f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
+ cmp r5,#0
+ beq 7f @ sticky bits zero?
+8:
+ lsls r0,#1 @ remove leading 1
+9:
+ subs r2,#1 @ compensate exponent on this path
+4:
+ cmp r2,#254
+ bge 5f @ overflow?
+ adds r2,#1 @ correct exponent offset
+ ble 10f @ denormal/underflow?
+ lsrs r0,#9 @ align mantissa
+ lsls r2,#23 @ align exponent
+ orrs r0,r2 @ assemble exponent and mantissa
+6:
+ orrs r0,r4 @ apply sign
+1:
+ bx r14
+
+5:
+ movs r0,#0xff @ create infinity
+ lsls r0,#23
+ b 6b
+
+10:
+ movs r0,#0 @ create zero
+ bx r14
+
+7: @ sticky bit rounding case
+ lsls r5,r0,#24 @ check bottom 8 bits of r0
+ bne 8b @ in rounding-tie case?
+ lsrs r0,#9 @ ensure even result
+ lsls r0,#10
+ b 9b
+)");
+}
+
+