diff --git a/README.md b/README.md
index db4522d..25d2fca 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,14 @@ You need:
 
 Extract ChibiOS to a folder, edit the `Makefile` so CHIBIOS points to that folder, then run `make`.
 
+### Flashing the card
+
+You'll need a 6-pin Tag-Connect cable (e.g. [TC2030-CTX-NL](https://www.tag-connect.com/product/tc2030-ctx-nl-6-pin-no-legs-cable-with-10-pin-micro-connector-for-cortex-processors)), compatible programmer, and OpenOCD. Power up the card and run the following command (using the appropriate interface scripts for your programmer):
+
+```
+openocd -f interface/ftdi/olimex-arm-usb-ocd-h.cfg -f interface/ftdi/olimex-arm-jtag-swd.cfg -f target/stm32g0x.cfg -c "program build/ch.hex verify reset exit"
+```
+
 ## Credits
 
 * [ESP32-I2S-SLM](https://hackaday.io/project/166867-esp32-i2s-slm) for a starting point with accurate decibel-measuring code.
diff --git a/qfplib-m0-full-20240105/qfplib-m0-full.h b/qfplib-m0-full-20240105/qfplib-m0-full.h
index 36d17d2..0b1f6eb 100644
--- a/qfplib-m0-full-20240105/qfplib-m0-full.h
+++ b/qfplib-m0-full-20240105/qfplib-m0-full.h
@@ -23,8 +23,6 @@ Fifth Floor, Boston, MA  02110-1301, USA.
 
 typedef unsigned           int ui32;
 typedef                    int i32;
-typedef unsigned long long int ui64;
-typedef          long long int i64;
 
 extern float  qfp_fadd          (float x,float y);
 extern float  qfp_fsub          (float x,float y);
@@ -40,58 +38,7 @@ extern float  qfp_int2float     (i32 x);
 extern float  qfp_fix2float     (i32 x,int f);
 extern float  qfp_uint2float    (ui32 x);
 extern float  qfp_ufix2float    (ui32 x,int f);
-extern float  qfp_int642float   (i64 x);
-extern float  qfp_fix642float   (i64 x,int f);
-extern float  qfp_uint642float  (ui64 x);
-extern float  qfp_ufix642float  (ui64 x,int f);
-extern float  qfp_fcos          (float x);
-extern float  qfp_fsin          (float x);
-extern float  qfp_ftan          (float x);
-extern float  qfp_fatan2        (float y,float x);
 extern float  qfp_fexp          (float x);
 extern float  qfp_fln           (float x);
 
-extern double qfp_dadd          (double x,double y);
-extern double qfp_dsub          (double x,double y);
-extern double qfp_dmul          (double x,double y);
-extern double qfp_ddiv          (double x,double y);
-extern double qfp_dsqrt         (double x);
-extern double qfp_dcos          (double x);
-extern double qfp_dsin          (double x);
-extern double qfp_dtan          (double x);
-extern double qfp_datan2        (double y,double x);
-extern double qfp_dexp          (double x);
-extern double qfp_dln           (double x);
-extern int    qfp_dcmp          (double x,double y);
-
-extern i64    qfp_float2int64   (float x);
-extern i64    qfp_float2fix64   (float x,int f);
-extern ui64   qfp_float2uint64  (float x);
-extern ui64   qfp_float2ufix64  (float x,int f);
-extern i32    qfp_float2int_z   (float x);
-extern i64    qfp_float2int64_z (float x);
-
-extern i32    qfp_double2int    (double x);
-extern i32    qfp_double2fix    (double x,int f);
-extern ui32   qfp_double2uint   (double x);
-extern ui32   qfp_double2ufix   (double x,int f);
-extern i64    qfp_double2int64  (double x);
-extern i64    qfp_double2fix64  (double x,int f);
-extern ui64   qfp_double2uint64 (double x);
-extern ui64   qfp_double2ufix64 (double x,int f);
-extern i32    qfp_double2int_z  (double x);
-extern i64    qfp_double2int64_z(double x);
-
-extern double qfp_int2double    (i32  x);
-extern double qfp_fix2double    (i32  x,int f);
-extern double qfp_uint2double   (ui32 x);
-extern double qfp_ufix2double   (ui32 x,int f);
-extern double qfp_int642double  (i64  x);
-extern double qfp_fix642double  (i64  x,int f);
-extern double qfp_uint642double (ui64 x);
-extern double qfp_ufix642double (ui64 x,int f);
-
-extern float  qfp_double2float  (double x);
-extern double qfp_float2double  (float x);
-
 #endif
diff --git a/qfplib-m0-full-20240105/qfplib-m0-full.s b/qfplib-m0-full-20240105/qfplib-m0-full.s
index 0a354f1..2ded28d 100644
--- a/qfplib-m0-full-20240105/qfplib-m0-full.s
+++ b/qfplib-m0-full-20240105/qfplib-m0-full.s
@@ -36,60 +36,9 @@
 .global qfp_fix2float
 .global qfp_uint2float
 .global qfp_ufix2float
-.global qfp_int642float
-.global qfp_fix642float
-.global qfp_uint642float
-.global qfp_ufix642float
-.global qfp_fcos
-.global qfp_fsin
-.global qfp_ftan
-.global qfp_fatan2
 .global qfp_fexp
 .global qfp_fln
 
-.global qfp_dadd
-.global qfp_dsub
-.global qfp_dmul
-.global qfp_ddiv
-.global qfp_dsqrt
-.global qfp_dcos
-.global qfp_dsin
-.global qfp_dtan
-.global qfp_datan2
-.global qfp_dexp
-.global qfp_dln
-.global qfp_dcmp
-
-.global qfp_float2int64
-.global qfp_float2fix64
-.global qfp_float2uint64
-.global qfp_float2ufix64
-.global qfp_float2int_z
-.global qfp_float2int64_z
-
-.global qfp_double2int
-.global qfp_double2fix
-.global qfp_double2uint
-.global qfp_double2ufix
-.global qfp_double2int64
-.global qfp_double2fix64
-.global qfp_double2uint64
-.global qfp_double2ufix64
-.global qfp_double2int_z
-.global qfp_double2int64_z
-
-.global qfp_int2double
-.global qfp_fix2double
-.global qfp_uint2double
-.global qfp_ufix2double
-.global qfp_int642double
-.global qfp_fix642double
-.global qfp_uint642double
-.global qfp_ufix642double
-
-.global qfp_double2float
-.global qfp_float2double
-
 qfp_lib_start:
 
 @ exchange r0<->r1, r2<->r3
@@ -266,31 +215,6 @@ qfp_fcmp:
  bge 2b
  b 3b
 
-@ convert float to signed int, rounding towards 0, clamping
-.thumb_func
-qfp_float2int_z:
- push {r14}
- cmp r0,#0
- blt 1f
- bl qfp_float2int   @ +ve or zero? then use rounding towards -Inf
- cmp r0,#0
- bge 2f
- ldr r0,=#0x7fffffff
-2:
- pop {r15}
-1:
- lsls r0,#1          @ -ve: clear sign bit
- lsrs r0,#1
- bl qfp_float2uint   @ convert to unsigned, rounding towards -Inf
- cmp r0,#0
- blt 1f
- rsbs r0,#0
- pop {r15}
-1:
- movs r0,#1
- lsls r0,#31         @ 0x80000000
- pop {r15}
-
 .ltorg
 
 @ convert float to signed int, rounding towards -Inf, clamping
@@ -351,56 +275,6 @@ qfp_float2ufix:
  lsls r0,r0,r2   @ result fits, left shifted
  pop {r4,r15}
 
-
-@ convert uint64 to float, rounding
-.thumb_func
-qfp_uint642float:
- movs r2,#0       @ fall through
-
-@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
-.thumb_func
-qfp_ufix642float:
- push {r4,r5,r14}
- cmp r1,#0
- bpl 3f          @ positive? we can use signed code
- lsls r5,r1,#31  @ contribution to sticky bits
- orrs r5,r0
- lsrs r0,r1,#1
- subs r2,#1
- b 4f
-
-@ convert int64 to float, rounding
-.thumb_func
-qfp_int642float:
- movs r2,#0       @ fall through
-
-@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
-.thumb_func
-qfp_fix642float:
- push {r4,r5,r14}
-3:
- movs r5,r0
- orrs r5,r1
- beq ret_pop45   @ zero? return +0
- asrs r5,r1,#31  @ sign bits
-2:
- asrs r4,r1,#24  @ try shifting 7 bits at a time
- cmp r4,r5
- bne 1f          @ next shift will overflow?
- lsls r1,#7
- lsrs r4,r0,#25
- orrs r1,r4
- lsls r0,#7
- adds r2,#7
- b 2b
-1:
- movs r5,r0
- movs r0,r1
-4:
- rsbs r2,#0
- adds r2,#32+29
- b packret
-
 @ convert signed int to float, rounding
 .thumb_func
 qfp_int2float:
@@ -436,164 +310,6 @@ qfp_ufix2float:
  lsrs r0,#1
  b packret
 
-@ All the scientific functions are implemented using the CORDIC algorithm. For notation,
-@ details not explained in the comments below, and a good overall survey see
-@ "50 Years of CORDIC: Algorithms, Architectures, and Applications" by Meher et al.,
-@ IEEE Transactions on Circuits and Systems Part I, Volume 56 Issue 9.
-
-@ Register use:
-@ r0: x
-@ r1: y
-@ r2: z/omega
-@ r3: coefficient pointer
-@ r4,r12: m
-@ r5: i (shift)
-
-cordic_start: @ initialisation
- movs r5,#0   @ initial shift=0
- mov r12,r4
- b 5f
-
-cordic_vstep: @ one step of algorithm in vector mode
- cmp r1,#0    @ check sign of y
- bgt 4f
- b 1f
-cordic_rstep: @ one step of algorithm in rotation mode
- cmp r2,#0    @ check sign of angle
- bge 1f
-4:
- subs r1,r6   @ negative rotation: y=y-(x>>i)
- rsbs r7,#0
- adds r2,r4   @ accumulate angle
- b 2f
-1:
- adds r1,r6   @ positive rotation: y=y+(x>>i)
- subs r2,r4   @ accumulate angle
-2:
- mov r4,r12
- muls r7,r4   @ apply sign from m
- subs r0,r7   @ finish rotation: x=x{+/-}(y>>i)
-5:
- ldmia r3!,{r4} @ fetch next angle from table and bump pointer
- lsrs r4,#1   @ repeated angle?
- bcs 3f
- adds r5,#1   @ adjust shift if not
-3:
- mov r6,r0
- asrs r6,r5   @ x>>i
- mov r7,r1
- asrs r7,r5   @ y>>i
- lsrs r4,#1   @ shift end flag into carry
- bx r14
-
-@ CORDIC rotation mode
-cordic_rot:
- push {r6,r7,r14}
- bl cordic_start   @ initialise
-1:
- bl cordic_rstep
- bcc 1b            @ step until table finished
- asrs r6,r0,#14    @ remaining small rotations can be linearised: see IV.B of paper referenced above
- asrs r7,r1,#14
- asrs r2,#3
- muls r6,r2        @ all remaining CORDIC steps in a multiplication
- muls r7,r2
- mov r4,r12
- muls r7,r4
- asrs r6,#12
- asrs r7,#12
- subs r0,r7        @ x=x{+/-}(yz>>k)
- adds r1,r6        @ y=y+(xz>>k)
-cordic_exit:
- pop {r6,r7,r15}
-
-@ CORDIC vector mode
-cordic_vec:
- push {r6,r7,r14}
- bl cordic_start   @ initialise
-1:
- bl cordic_vstep
- bcc 1b            @ step until table finished
-4:
- cmp r1,#0         @ continue as in cordic_vstep but without using table; x is not affected as y is small
- bgt 2f            @ check sign of y
- adds r1,r6        @ positive rotation: y=y+(x>>i)
- subs r2,r4        @ accumulate angle
- b 3f
-2:
- subs r1,r6        @ negative rotation: y=y-(x>>i)
- adds r2,r4        @ accumulate angle
-3:
- asrs r6,#1
- asrs r4,#1        @ next "table entry"
- bne 4b
- b cordic_exit
-
-.thumb_func
-qfp_fsin:            @ calculate sin and cos using CORDIC rotation method
- push {r4,r5,r14}
- movs r1,#24
- bl qfp_float2fix    @ range reduction by repeated subtraction/addition in fixed point
- ldr r4,pi_q29
- lsrs r4,#4          @ 2pi Q24
-1:
- subs r0,r4
- bge 1b
-1:
- adds r0,r4
- bmi 1b              @ now in range 0..2pi
- lsls r2,r0,#2       @ z Q26
- lsls r5,r4,#1       @ pi Q26 (r4=pi/2 Q26)
- ldr r0,=#0x136e9db4 @ initialise CORDIC x,y with scaling
- movs r1,#0
-1:
- cmp r2,r4           @ >pi/2?
- blt 2f
- subs r2,r5          @ reduce range to -pi/2..pi/2
- rsbs r0,#0          @ rotate vector by pi
- b 1b
-2:
- lsls r2,#3          @ Q29
- adr r3,tab_cc       @ circular coefficients
- movs r4,#1          @ m=1
- bl cordic_rot
- adds r1,#9          @ fiddle factor to make sin(0)==0
- movs r2,#0          @ exponents to zero
- movs r3,#0
- movs r5,#0          @ no sticky bits
- bl clampx
- bl packx            @ pack cosine
- bl xchxy
- bl clampx
- b packretns         @ pack sine
-
-.thumb_func
-qfp_fcos:
- push {r14}
- bl qfp_fsin
- mov r0,r1           @ extract cosine result
- pop {r15}
-
-@ force r0 to lie in range [-1,1] Q29
-clampx:
- movs r4,#1
- lsls r4,#29
- cmp r0,r4
- bgt 1f
- rsbs r4,#0
- cmp r0,r4
- ble 1f
- bx r14
-1:
- movs r0,r4
- bx r14
-
-.thumb_func
-qfp_ftan:
- push {r4,r5,r6,r14}
- bl qfp_fsin         @ sine in r0/r2, cosine in r1/r3
- b fdiv_n            @ sin/cos
-
 .thumb_func
 qfp_fexp:
  push {r4,r5,r14}
@@ -733,90 +449,6 @@ ftab_exp:
 .word 0x0001fff8   @ log 1+2^-13 Q30
 .word 0x0000fffe   @ log 1+2^-14 Q30
 
-.thumb_func
-qfp_fatan2:
- push {r4,r5,r14}
-
-@ unpack arguments and shift one down to have common exponent
- bl unpackx
- bl xchxy
- bl unpackx
- lsls r0,r0,#5  @ Q28
- lsls r1,r1,#5  @ Q28
- adds r4,r2,r3  @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
- asrs r4,#9
- adds r4,#1
- bmi 2f         @ force y to 0 proper, so result will be zero
- subs r4,r2,r3  @ calculate shift
- bge 1f         @ ex>=ey?
- rsbs r4,#0     @ make shift positive
- asrs r0,r4
- cmp r4,#28
- blo 3f
- asrs r0,#31
- b 3f
-1:
- asrs r1,r4
- cmp r4,#28
- blo 3f
-2:
-@ here |x|>>|y| or both x and y are ±0
- cmp r0,#0
- bge 4f         @ x positive, return signed 0
- ldr r0,pi_q29  @ x negative, return +/- pi
- asrs r1,#31
- eors r0,r1
- b 7f
-4:
- asrs r0,r1,#31
- b 7f
-3:
- movs r2,#0              @ initial angle
- cmp r0,#0               @ x negative
- bge 5f
- rsbs r0,#0              @ rotate to 1st/4th quadrants
- rsbs r1,#0
- ldr r2,pi_q29           @ pi Q29
-5:
- adr r3,tab_cc           @ circular coefficients
- movs r4,#1              @ m=1
- bl cordic_vec           @ also produces magnitude (with scaling factor 1.646760119), which is discarded
- mov r0,r2               @ result here is -pi/2..3pi/2 Q29
-@ asrs r2,#29
-@ subs r0,r2
- ldr r2,pi_q29           @ pi Q29
- adds r4,r0,r2           @ attempt to fix -3pi/2..-pi case
- bcs 6f                  @ -pi/2..0? leave result as is
- subs r4,r0,r2           @ <pi? leave as is
- bmi 6f
- subs r0,r4,r2           @ >pi: take off 2pi
-6:
- subs r0,#1              @ fiddle factor so atan2(0,1)==0
-7:
- movs r2,#0              @ exponent for pack
- b packretns
-
-.align 2
-.ltorg
-
-@ first entry in following table is pi Q29
-pi_q29:
-@ circular CORDIC coefficients: atan(2^-i), b0=flag for preventing shift, b1=flag for end of table
-tab_cc:
-.word 0x1921fb54*4+1     @ no shift before first iteration
-.word 0x0ed63383*4+0
-.word 0x07d6dd7e*4+0
-.word 0x03fab753*4+0
-.word 0x01ff55bb*4+0
-.word 0x00ffeaae*4+0
-.word 0x007ffd55*4+0
-.word 0x003fffab*4+0
-.word 0x001ffff5*4+0
-.word 0x000fffff*4+0
-.word 0x0007ffff*4+0
-.word 0x00040000*4+0
-.word 0x00020000*4+0+2   @ +2 marks end
-
 .align 2
 .thumb_func
 qfp_fsub:
@@ -1404,2267 +1036,4 @@ sq_2:
 rsqrtapp:
 .byte 0xf1,0xda,0xc9,0xbb, 0xb0,0xa6,0x9e,0x97, 0x91,0x8b,0x86,0x82
 
-
-
-@ Notation:
-@ rx:ry means the concatenation of rx and ry with rx having the less significant bits
-
-@ IEEE double in ra:rb ->
-@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set
-@ exponent in re
-@ sign in rs
-@ trashes rt
-.macro mdunpack ra,rb,re,rs,rt
- lsrs \re,\rb,#20              @ extract sign and exponent
- subs \rs,\re,#1
- lsls \rs,#20
- subs \rb,\rs                  @ clear sign and exponent in mantissa; insert implied 1
- lsrs \rs,\re,#11              @ sign
- lsls \re,#21
- lsrs \re,#21                  @ exponent
- beq l\@_1                     @ zero exponent?
- adds \rt,\re,#1
- lsrs \rt,#11
- beq l\@_2                     @ exponent != 0x7ff? then done
-l\@_1:
- movs \ra,#0
- movs \rb,#1
- lsls \rb,#20
- subs \re,#128
- lsls \re,#12
-l\@_2:
-.endm
-
-@ IEEE double in ra:rb ->
-@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1
-@ exponent in re
-@ trashes rt0 and rt1
-@ +zero, +denormal -> exponent=-0x80000
-@ -zero, -denormal -> exponent=-0x80000
-@ +Inf, +NaN -> exponent=+0x77f000
-@ -Inf, -NaN -> exponent=+0x77e000
-.macro mdunpacks ra,rb,re,rt0,rt1
- lsrs \re,\rb,#20              @ extract sign and exponent
- lsrs \rt1,\rb,#31             @ sign only
- subs \rt0,\re,#1
- lsls \rt0,#20
- subs \rb,\rt0                 @ clear sign and exponent in mantissa; insert implied 1
- lsls \re,#21
- bcc l\@_1                     @ skip on positive
- mvns \rb,\rb                  @ negate mantissa
- rsbs \ra,#0
- bcc l\@_1
- adds \rb,#1
-l\@_1:
- lsrs \re,#21
- beq l\@_2                     @ zero exponent?
- adds \rt0,\re,#1
- lsrs \rt0,#11
- beq l\@_3                     @ exponent != 0x7ff? then done
- subs \re,\rt1
-l\@_2:
- movs \ra,#0
- lsls \rt1,#1                  @ +ve: 0  -ve: 2
- adds \rb,\rt1,#1              @ +ve: 1  -ve: 3
- lsls \rb,#30                  @ create +/-1 mantissa
- asrs \rb,#10
- subs \re,#128
- lsls \re,#12
-l\@_3:
-.endm
-
-.align 2
-.thumb_func
-qfp_dsub:
- push {r4-r7,r14}
- movs r4,#1
- lsls r4,#31
- eors r3,r4                    @ flip sign on second argument
- b da_entry                    @ continue in dadd
-
-.align 2
-.thumb_func
-qfp_dadd:
- push {r4-r7,r14}
-da_entry:
- mdunpacks r0,r1,r4,r6,r7
- mdunpacks r2,r3,r5,r6,r7
- subs r7,r5,r4                 @ ye-xe
- subs r6,r4,r5                 @ xe-ye
- bmi da_ygtx
-@ here xe>=ye: need to shift y down r6 places
- mov r12,r4                    @ save exponent
- cmp r6,#32
- bge da_xrgty                  @ xe rather greater than ye?
- adds r7,#32
- movs r4,r2
- lsls r4,r4,r7                 @ rounding bit + sticky bits
-da_xgty0:
- movs r5,r3
- lsls r5,r5,r7
- lsrs r2,r6
- asrs r3,r6
- orrs r2,r5
-da_add:
- adds r0,r2
- adcs r1,r3
-da_pack:
-@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4
-@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we
-@ cannot have not gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4
-@ to provide a correct continuation of the exact result.
-@ now pack result back up
- lsrs r3,r1,#31                @ get sign bit
- beq 1f                        @ skip on positive
- mvns r1,r1                    @ negate mantissa
- mvns r0,r0
- movs r2,#0
- rsbs r4,#0
- adcs r0,r2
- adcs r1,r2
-1:
- mov r2,r12                    @ get exponent
- lsrs r5,r1,#21
- bne da_0                      @ shift down required?
- lsrs r5,r1,#20
- bne da_1                      @ normalised?
- cmp r0,#0
- beq da_5                      @ could mantissa be zero?
-da_2:
- adds r4,r4
- adcs r0,r0
- adcs r1,r1
- subs r2,#1                    @ adjust exponent
- lsrs r5,r1,#20
- beq da_2
-da_1:
- lsls r4,#1                    @ check rounding bit
- bcc da_3
-da_4:
- adds r0,#1                    @ round up
- bcc 2f
- adds r1,#1
-2:
- cmp r4,#0                     @ sticky bits zero?
- bne da_3
- lsrs r0,#1                    @ round to even
- lsls r0,#1
-da_3:
- subs r2,#1
- bmi da_6
- adds r4,r2,#2                 @ check if exponent is overflowing
- lsrs r4,#11
- bne da_7
- lsls r2,#20                   @ pack exponent and sign
- add r1,r2
- lsls r3,#31
- add r1,r3
- pop {r4-r7,r15}
-
-da_7:
-@ here exponent overflow: return signed infinity
- lsls r1,r3,#31
- ldr r3,=#0x7ff00000
- orrs r1,r3
- b 1f
-da_6:
-@ here exponent underflow: return signed zero
- lsls r1,r3,#31
-1:
- movs r0,#0
- pop {r4-r7,r15}
-
-da_5:
-@ here mantissa could be zero
- cmp r1,#0
- bne da_2
- cmp r4,#0
- bne da_2
-@ inputs must have been of identical magnitude and opposite sign, so return +0
- pop {r4-r7,r15}
-
-da_0:
-@ here a shift down by one place is required for normalisation
- adds r2,#1                    @ adjust exponent
- lsls r6,r0,#31                @ save rounding bit
- lsrs r0,#1
- lsls r5,r1,#31
- orrs r0,r5
- lsrs r1,#1
- cmp r6,#0
- beq da_3
- b da_4
-
-da_xrgty:                      @ xe>ye and shift>=32 places
- cmp r6,#60
- bge da_xmgty                  @ xe much greater than ye?
- subs r6,#32
- adds r7,#64
-
- movs r4,r2
- lsls r4,r4,r7                 @ these would be shifted off the bottom of the sticky bits
- beq 1f
- movs r4,#1
-1:
- lsrs r2,r2,r6
- orrs r4,r2
- movs r2,r3
- lsls r3,r3,r7
- orrs r4,r3
- asrs r3,r2,#31                @ propagate sign bit
- b da_xgty0
-
-da_ygtx:
-@ here ye>xe: need to shift x down r7 places
- mov r12,r5                    @ save exponent
- cmp r7,#32
- bge da_yrgtx                  @ ye rather greater than xe?
- adds r6,#32
- movs r4,r0
- lsls r4,r4,r6                 @ rounding bit + sticky bits
-da_ygtx0:
- movs r5,r1
- lsls r5,r5,r6
- lsrs r0,r7
- asrs r1,r7
- orrs r0,r5
- b da_add
-
-da_yrgtx:
- cmp r7,#60
- bge da_ymgtx                  @ ye much greater than xe?
- subs r7,#32
- adds r6,#64
-
- movs r4,r0
- lsls r4,r4,r6                 @ these would be shifted off the bottom of the sticky bits
- beq 1f
- movs r4,#1
-1:
- lsrs r0,r0,r7
- orrs r4,r0
- movs r0,r1
- lsls r1,r1,r6
- orrs r4,r1
- asrs r1,r0,#31                @ propagate sign bit
- b da_ygtx0
-
-da_ymgtx:                      @ result is just y
- movs r0,r2
- movs r1,r3
-da_xmgty:                      @ result is just x
- movs r4,#0                    @ clear sticky bits
- b da_pack
-
-.ltorg
-
-@ equivalent of UMULL
-@ needs five temporary registers
-@ can have rt3==rx, in which case rx trashed
-@ can have rt4==ry, in which case ry trashed
-@ can have rzl==rx
-@ can have rzh==ry
-@ can have rzl,rzh==rt3,rt4
-.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
-                               @   t0   t1   t2   t3   t4
-                               @                  (x)  (y)
- uxth \rt0,\rx                 @   xl
- uxth \rt1,\ry                 @        yl
- muls \rt0,\rt1                @  xlyl=L
- lsrs \rt2,\rx,#16             @             xh
- muls \rt1,\rt2                @       xhyl=M0
- lsrs \rt4,\ry,#16             @                       yh
- muls \rt2,\rt4                @           xhyh=H
- uxth \rt3,\rx                 @                   xl
- muls \rt3,\rt4                @                  xlyh=M1
- adds \rt1,\rt3                @      M0+M1=M
- bcc l\@_1                     @ addition of the two cross terms can overflow, so add carry into H
- movs \rt3,#1                  @                   1
- lsls \rt3,#16                 @                0x10000
- adds \rt2,\rt3                @             H'
-l\@_1:
-                               @   t0   t1   t2   t3   t4
-                               @                 (zl) (zh)
- lsls \rzl,\rt1,#16            @                  ML
- lsrs \rzh,\rt1,#16            @                       MH
- adds \rzl,\rt0                @                  ZL
- adcs \rzh,\rt2                @                       ZH
-.endm
-
-@ SUMULL: x signed, y unsigned
-@ in table below ¯ means signed variable
-@ needs five temporary registers
-@ can have rt3==rx, in which case rx trashed
-@ can have rt4==ry, in which case ry trashed
-@ can have rzl==rx
-@ can have rzh==ry
-@ can have rzl,rzh==rt3,rt4
-.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
-                               @   t0   t1   t2   t3   t4
-                               @                 ¯(x)  (y)
- uxth \rt0,\rx                 @   xl
- uxth \rt1,\ry                 @        yl
- muls \rt0,\rt1                @  xlyl=L
- asrs \rt2,\rx,#16             @            ¯xh
- muls \rt1,\rt2                @      ¯xhyl=M0
- lsrs \rt4,\ry,#16             @                       yh
- muls \rt2,\rt4                @          ¯xhyh=H
- uxth \rt3,\rx                 @                   xl
- muls \rt3,\rt4                @                 xlyh=M1
- asrs \rt4,\rt1,#31            @                      M0sx   (M1 sign extension is zero)
- adds \rt1,\rt3                @      M0+M1=M 
- movs \rt3,#0                  @                    0
- adcs \rt4,\rt3                @                      ¯Msx
- lsls \rt4,#16                 @                    ¯Msx<<16
- adds \rt2,\rt4                @             H'
-
-                               @   t0   t1   t2   t3   t4
-                               @                 (zl) (zh)
- lsls \rzl,\rt1,#16            @                  M~
- lsrs \rzh,\rt1,#16            @                       M~
- adds \rzl,\rt0                @                  ZL
- adcs \rzh,\rt2                @                      ¯ZH
-.endm
-
-@ SSMULL: x signed, y signed
-@ in table below ¯ means signed variable
-@ needs five temporary registers
-@ can have rt3==rx, in which case rx trashed
-@ can have rt4==ry, in which case ry trashed
-@ can have rzl==rx
-@ can have rzh==ry
-@ can have rzl,rzh==rt3,rt4
-.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
-                               @   t0   t1   t2   t3   t4
-                               @                 ¯(x)  (y)
- uxth \rt0,\rx                 @   xl
- uxth \rt1,\ry                 @        yl
- muls \rt0,\rt1                @  xlyl=L
- asrs \rt2,\rx,#16             @            ¯xh
- muls \rt1,\rt2                @      ¯xhyl=M0
- asrs \rt4,\ry,#16             @                      ¯yh
- muls \rt2,\rt4                @          ¯xhyh=H
- uxth \rt3,\rx                 @                   xl
- muls \rt3,\rt4                @                ¯xlyh=M1
- adds \rt1,\rt3                @     ¯M0+M1=M
- asrs \rt3,\rt1,#31            @                  Msx
- bvc l\@_1                     @
- mvns \rt3,\rt3                @                 ¯Msx        flip sign extension bits if overflow
-l\@_1:
- lsls \rt3,#16                 @                    ¯Msx<<16
- adds \rt2,\rt3                @             H'
-
-                               @   t0   t1   t2   t3   t4
-                               @                 (zl) (zh)
- lsls \rzl,\rt1,#16            @                  M~
- lsrs \rzh,\rt1,#16            @                       M~
- adds \rzl,\rt0                @                  ZL
- adcs \rzh,\rt2                @                      ¯ZH
-.endm
-
-@ can have rt2==rx, in which case rx trashed
-@ can have rzl==rx
-@ can have rzh==rt1
-.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2
-                               @   t0   t1   t2   zl   zh
- uxth \rt0,\rx                 @   xl
- muls \rt0,\rt0                @ xlxl=L 
- uxth \rt1,\rx                 @        xl
- lsrs \rt2,\rx,#16             @             xh
- muls \rt1,\rt2                @      xlxh=M
- muls \rt2,\rt2                @           xhxh=H
- lsls \rzl,\rt1,#17            @                  ML
- lsrs \rzh,\rt1,#15            @                       MH
- adds \rzl,\rt0                @                  ZL
- adcs \rzh,\rt2                @                       ZH
-.endm
-
-.align 2
-.thumb_func
-qfp_dmul:
- push {r4-r7,r14}
- mdunpack r0,r1,r4,r6,r5
- mov r12,r4
- mdunpack r2,r3,r4,r7,r5
- eors r7,r6                    @ sign of result
- add r4,r12                    @ exponent of result
- push {r0-r2,r4,r7}
-
-@ accumulate full product in r12:r5:r6:r7
- mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5    @ XL*YL
- mov r12,r0                    @ save LL bits
-
- mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7    @ XH*YH
-
- pop {r0}                      @ XL
- mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3    @ XL*YH
- adds r5,r0
- adcs r6,r3
- movs r0,#0
- adcs r7,r0
-
- pop {r1,r2}                   @ XH,YL
- mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2   @ XH*YL
- adds r5,r1
- adcs r6,r2
- movs r0,#0
- adcs r7,r0
-
-@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12
- pop {r3,r4}                   @ exponent in r3, sign in r4
- lsls r1,r7,#11
- lsrs r2,r6,#21
- orrs r1,r2
- lsls r0,r6,#11
- lsrs r2,r5,#21
- orrs r0,r2
- lsls r5,#11                   @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12
- lsrs r2,r1,#20
- bne 1f                        @ skip if in range [2..4)
- adds r5,r5                    @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32)
- adcs r0,r0
- adcs r1,r1
- subs r3,#1                    @ correct exponent
-1:
- ldr r6,=#0x3ff
- subs r3,r6                    @ correct for exponent bias
- lsls r6,#1                    @ 0x7fe
- cmp r3,r6
- bhs dm_0                      @ exponent over- or underflow
- lsls r5,#1                    @ rounding bit to carry
- bcc 1f                        @ result is correctly rounded
- adds r0,#1
- movs r6,#0
- adcs r1,r6                    @ round up
- mov r6,r12                    @ remaining sticky bits
- orrs r5,r6
- bne 1f                        @ some sticky bits set?
- lsrs r0,#1
- lsls r0,#1                    @ round to even
-1:
- lsls r3,#20
- adds r1,r3
-dm_2:
- lsls r4,#31
- add r1,r4
- pop {r4-r7,r15}
-
-@ here for exponent over- or underflow
-dm_0:
- bge dm_1                      @ overflow?
- adds r3,#1                    @ would-be zero exponent?
- bne 1f
- adds r0,#1
- bne 1f                        @ all-ones mantissa?
- adds r1,#1
- lsrs r7,r1,#21
- beq 1f
- lsrs r1,#1
- b dm_2
-1:
- lsls r1,r4,#31
- movs r0,#0
- pop {r4-r7,r15}
-
-@ here for exponent overflow
-dm_1:
- adds r6,#1                    @ 0x7ff
- lsls r1,r6,#20
- movs r0,#0
- b dm_2
-
-.ltorg
-
-@ Approach to division y/x is as follows.
-@
-@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top
-@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so).
-@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction
-@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact
-@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to
-@ round up or down.
-@
-@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively
-@ that | u1*x-1 | < 10*2^-32.
-@
-@ More precisely:
-@
-@ x0=(q16)x;
-@ x1=(q30)x;
-@ y0=(q31)y;
-@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation
-@ v=(q30)(u0*x1-1);
-@ u1=(q30)u0-(q30~)(u0*v);
-@
-@ a0=(q30)(u1*y0);
-@ r0=(q82)y-a0*x;
-@ r0x=(q57)r0;
-@ d0=r0x*u1;
-@ a1=d0+a0;
-@
-@ Error analysis
-@
-@ Use Greek letters to represent the errors introduced by rounding and truncation.
-@
-@               r₀ = y - a₀x
-@                  = y - [ u₁ ( y - α ) - β ] x    where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30
-@                  = y ( 1 - u₁x ) + ( u₁α + β ) x
-@
-@     Hence
-@
-@       | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30
-@                  = 26*2^-32
-@
-@               r₁ = y - a₁x
-@                  = y - a₀x - d₀x
-@                  = r₀ - d₀x
-@                  = r₀ - u₁ ( r₀ - γ ) x    where 0 ≤ γ < 2^-57
-@                  = r₀ ( 1 - u₁x ) + u₁γx
-@
-@     Hence
-@
-@       | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57
-@                  = (260+128)*2^-64
-@                  < 2^-55
-@
-@ Empirically it seems to be nearly twice as good as this.
-@
-@ To determine correctly whether the exact remainder calculation can be skipped we need a result
-@ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation
-@ and so 1ulp is 2^-53 and so the calculation above suffices.
-
-.align 2
-.thumb_func
-qfp_ddiv:
- push {r4-r7,r14}
-ddiv0:                         @ entry point from dtan
- mdunpack r2,r3,r4,r7,r6       @ unpack divisor
-
-@ unpack dividend by hand to save on register use
- lsrs r6,r1,#31
- adds r6,r7
- mov r12,r6                    @ result sign in r12b0; r12b1 trashed
- lsls r1,#1
- lsrs r7,r1,#21                @ exponent
- beq 1f                        @ zero exponent?
- adds r6,r7,#1
- lsrs r6,#11
- beq 2f                        @ exponent != 0x7ff? then done
-1:
- movs r0,#0
- movs r1,#0
- subs r7,#64                   @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct
- lsls r7,#12
-2:
- subs r6,r7,r4
- lsls r6,#2
- add r12,r12,r6                @ (signed) exponent in r12[31..8]
- subs r7,#1                    @ implied 1
- lsls r7,#21
- subs r1,r7
- lsrs r1,#1
-
-// see dreciptest-boxc.c
- lsrs r4,r3,#15 @ x2=x>>15; // Q5 32..63
- ldr r5,=#(rcpapp-32)
- ldrb r4,[r5,r4] @ u=lut5[x2-32]; // Q8
- lsls r5,r3,#8
- muls r5,r5,r4
- asrs r5,#14 @ e=(i32)(u*(x<<8))>>14; // Q22
- asrs r6,r5,#11
- muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
- subs r5,r6
- muls r5,r5,r4 @ c=(e-e2)*u; // Q30
- lsls r6,r4,#7
- asrs r5,#14
- adds r5,#1
- asrs r5,#1
- subs r6,r5 @ u0=(u<<7)-((c+0x4000)>>15); // Q15
-
-@ here
-@ r0:r1 y mantissa
-@ r2:r3 x mantissa
-@ r6    u0, first approximation to 1/x Q15
-@ r12: result sign, exponent
-
- lsls r4,r3,#10
- lsrs r5,r2,#22
- orrs r5,r4                    @ x1=(q30)x
- muls r5,r6                    @ u0*x1 Q45
- asrs r5,#15                   @ v=u0*x1-1 Q30
- muls r5,r6                    @ u0*v Q45
- asrs r5,#14
- adds r5,#1
- asrs r5,#1                    @ round u0*v to Q30
- lsls r6,#15
- subs r6,r5                    @ u1 Q30
-
-@ here
-@ r0:r1 y mantissa
-@ r2:r3 x mantissa
-@ r6    u1, second approximation to 1/x Q30
-@ r12: result sign, exponent
-
- push {r2,r3}
- lsls r4,r1,#11
- lsrs r5,r0,#21
- orrs r4,r5                    @ y0=(q31)y
- mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5  @ y0*u1 Q61
- adds r4,r4
- adcs r5,r5                    @ a0=(q30)(y0*u1)
-
-@ here
-@ r0:r1 y mantissa
-@ r5    a0, first approximation to y/x Q30
-@ r6    u1, second approximation to 1/x Q30
-@ r12   result sign, exponent
-
- ldr r2,[r13,#0]               @ xL
- mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3  @ xL*a0
- ldr r4,[r13,#4]               @ xH
- muls r4,r5                    @ xH*a0
- adds r3,r4                    @ r2:r3 now x*a0 Q82
- lsrs r2,#25
- lsls r1,r3,#7
- orrs r2,r1                    @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89
- lsls r4,r0,#5                 @ y Q57
- subs r0,r4,r2                 @ r0x=y-x*a0 Q57 (signed)
-
-@ here
-@ r0  r0x Q57
-@ r5  a0, first approximation to y/x Q30
-@ r4  yL  Q57
-@ r6  u1 Q30
-@ r12 result sign, exponent
-
- muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6   @ r7:r6 r0x*u1 Q87
- asrs r3,r6,#25
- adds r5,r3
- lsls r3,r6,#7                 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point)
-@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7
-@ but these bits are thrown away in the rounding and conversion to Q52 below
-
-@ here
-@ r3:r5  a1 Q62 candidate quotient [0.5,2) or so
-@ r4     yL Q57
-@ r12    result sign, exponent
-
- movs r6,#0
- adds r3,#128                  @ for initial rounding to Q53
- adcs r5,r5,r6
- lsrs  r1,r5,#30
- bne dd_0
-@ here candidate quotient a1 is in range [0.5,1)
-@ so 30 significant bits in r5
-
- lsls r4,#1                    @ y now Q58
- lsrs r1,r5,#9                 @ to Q52
- lsls r0,r5,#23
- lsrs r3,#9                    @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
- orrs r0,r3
- bcs dd_1
- b dd_2
-dd_0:
-@ here candidate quotient a1 is in range [1,2)
-@ so 31 significant bits in r5
-
- movs r2,#4
- add r12,r12,r2                @ fix exponent; r3:r5 now effectively Q61
- adds r3,#128                  @ complete rounding to Q53
- adcs r5,r5,r6
- lsrs r1,r5,#10
- lsls r0,r5,#22
- lsrs r3,#10                   @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
- orrs r0,r3
- bcc dd_2
-dd_1:
-
-@ here
-@ r0:r1  rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest
-@ r4     yL Q58 or Q57
-@ r12    result sign, exponent
-@ carry set
-
- adcs r0,r0,r0
- adcs r1,r1,r1                 @ z Q53 with 1 in LSB
- lsls r4,#16                   @ Q105-32=Q73
- ldr r2,[r13,#0]               @ xL Q52
- ldr r3,[r13,#4]               @ xH Q20
-
- movs r5,r1                    @ zH Q21
- muls r5,r2                    @ zH*xL Q73
- subs r4,r5
- muls r3,r0                    @ zL*xH Q73
- subs r4,r3
- mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3  @ xL*zL
- rsbs r2,#0                    @ borrow from low half?
- sbcs r4,r3                    @ y-xz Q73 (remainder bits 52..73)
-
- cmp r4,#0
-
- bmi 1f
- movs r2,#0                    @ round up
- adds r0,#1
- adcs r1,r2
-1:
- lsrs r0,#1                    @ shift back down to Q52
- lsls r2,r1,#31
- orrs r0,r2
- lsrs r1,#1
-dd_2:
- add r13,#8
- mov r2,r12
- lsls r7,r2,#31                @ result sign
- asrs r2,#2                    @ result exponent
- ldr r3,=#0x3fd
- adds r2,r3
- ldr r3,=#0x7fe
- cmp r2,r3
- bhs dd_3                      @ over- or underflow?
- lsls r2,#20
- adds r1,r2                    @ pack exponent
-dd_5:
- adds r1,r7                    @ pack sign
- pop {r4-r7,r15}
-
-dd_3:
- movs r0,#0
- cmp r2,#0
- bgt dd_4                      @ overflow?
- movs r1,r7
- pop {r4-r7,r15}
-
-dd_4:
- adds r3,#1                    @ 0x7ff
- lsls r1,r3,#20
- b dd_5
-
-/*
-Approach to square root x=sqrt(y) is as follows.
-
-First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y
-to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because
-d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is
-a better approximation to sqrt(y). If this is near a rounding boundary we
-compute an exact remainder y-a5*a5 to decide whether to round up or down.
-
-The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively
-that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32.
-
-More precisely, with "y" representing y truncated to 30 binary places:
-
-u=(q3)y;                          // 24-entry table
-a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u);  // first approximation from table
-p0=(q16)(a0*a0) * (q16)y;
-r0=(q20)(p0-1);
-dy0=(q15)(r0*a0);                 // Newton-Raphson correction term
-a1=(q16)a0-dy0/2;                 // good to ~9 bits
-
-p1=(q19)(a1*a1)*(q19)y;
-r1=(q23)(p1-1);
-dy1=(q15~)(r1*a1);                // second Newton-Raphson correction
-a2x=(q16)a1-dy1/2;                // good to ~16 bits
-a2=a2x-a2x/1t16;                  // prevent overflow of a2*a2 in 32 bits
-
-p2=(a2*a2)*(q30)y;                // Q62
-r2=(q36)(p2-1+1t-31);
-dy2=(q30)(r2*a2);                 // Q52->Q30
-a3=(q31)a2-dy2/2;                 // good to about 30 bits
-a4=(q30)(a3*(q30)y+1t-31);        // good to about 28 bits
-
-Error analysis
-
-          r₄ = y - a₄²
-          d₄ = 1/2 a₃r₄
-          a₅ = a₄ + d₄
-          r₅ = y - a₅²
-             = y - ( a₄ + d₄ )²
-             = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄²
-             = r₄ - a₃a₄r₄ - 1/4 a₃²r₄²
-
-      | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄²
-
-          a₅ = √y √( 1 - r₅/y )
-             = √y ( 1 - 1/2 r₅/y + ... )
-
-So to first order (second order being very tiny)
-
-     √y - a₅ = 1/2 r₅/y
-
-and
-
- | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y )
-
-From dsqrttest.c (conservatively):
-
-             < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 )
-             = 1/2 ( 200 + 200 ) * 2^-64
-             < 2^-56
-
-Empirically we see about 1ulp worst-case error including rounding at Q57.
-
-To determine correctly whether the exact remainder calculation can be skipped we need a result
-accurate to < 0.25ulp at Q52, or 2^-54.
-*/
-
-dq_2:
- bge dq_3                      @ +Inf?
- movs r1,#0
- b dq_4
-
-dq_0:
- lsrs r1,#31
- lsls r1,#31                   @ preserve sign bit
- lsrs r2,#21                   @ extract exponent
- beq dq_4                      @ -0? return it
- asrs r1,#11                   @ make -Inf
- b dq_4
-
-dq_3:
- ldr r1,=#0x7ff
- lsls r1,#20                   @ return +Inf
-dq_4:
- movs r0,#0
-dq_1:
- bx r14
-
-.align 2
-.thumb_func
-qfp_dsqrt:
- lsls r2,r1,#1
- bcs dq_0                      @ negative?
- lsrs r2,#21                   @ extract exponent
- subs r2,#1
- ldr r3,=#0x7fe
- cmp r2,r3
- bhs dq_2                      @ catches 0 and +Inf
- push {r4-r7,r14}
- lsls r4,r2,#20
- subs r1,r4                    @ insert implied 1
- lsrs r2,#1
- bcc 1f                        @ even exponent? skip
- adds r0,r0,r0                 @ odd exponent: shift up mantissa
- adcs r1,r1,r1
-1:
- lsrs r3,#2
- adds r2,r3
- lsls r2,#20
- mov r12,r2                    @ save result exponent
-
-@ here
-@ r0:r1  y mantissa Q52 [1,4)
-@ r12    result exponent
-
- adr r4,drsqrtapp-8            @ first eight table entries are never accessed because of the mantissa's leading 1
- lsrs r2,r1,#17                @ y Q3
- ldrb r2,[r4,r2]               @ initial approximation to reciprocal square root a0 Q8
- lsrs r3,r1,#4                 @ first Newton-Raphson iteration
- muls r3,r2
- muls r3,r2                    @  i32 p0=a0*a0*(y>>14);          // Q32
- asrs r3,r3,#12                @  i32 r0=p0>>12;                 // Q20
- muls r3,r2
- asrs r3,#13                   @  i32 dy0=(r0*a0)>>13;           // Q15
- lsls r2,#8
- subs r2,r3                    @  i32 a1=(a0<<8)-dy0;         // Q16
-
- movs r3,r2
- muls r3,r3
- lsrs r3,#13
- lsrs r4,r1,#1
- muls r3,r4                    @  i32 p1=((a1*a1)>>11)*(y>>11);  // Q19*Q19=Q38
- asrs r3,#15                   @  i32 r1=p1>>15;                 // Q23
- muls r3,r2
- asrs r3,#23
- adds r3,#1
- asrs r3,#1                    @  i32 dy1=(r1*a1+(1<<23))>>24;   // Q23*Q16=Q39; Q15
- subs r2,r3                    @  i32 a2=a1-dy1;                 // Q16
- lsrs r3,r2,#16
- subs r2,r3                    @  if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2
-
-@ here
-@ r0:r1 y mantissa
-@ r2    a2 ~ 1/sqrt(y) Q16
-@ r12   result exponent
-
- movs r3,r2
- muls r3,r3
- lsls r1,#10
- lsrs r4,r0,#22
- orrs r1,r4                    @ y Q30
- mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3   @  i64 p2=(ui64)(a2*a2)*(ui64)y;  // Q62 r4:r3
- lsls r5,r3,#6
- lsrs r4,#26
- orrs r4,r5
- adds r4,#0x20                 @  i32 r2=(p2>>26)+0x20;          // Q36 r4
- uxth r5,r4
- muls r5,r2
- asrs r4,#16
- muls r4,r2
- lsrs r5,#16
- adds r4,r5
- asrs r4,#6                    @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30
- lsls r2,#15
- subs r2,r4
-
-@ here
-@ r0    y low bits
-@ r1    y Q30
-@ r2    a3 ~ 1/sqrt(y) Q31
-@ r12   result exponent
-
- mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4
- adds r3,r3,r3
- adcs r4,r4,r4
- adds r3,r3,r3
- movs r3,#0
- adcs r3,r4                    @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30
-
-@ here
-@ r0    y low bits
-@ r1    y Q30
-@ r2    a3 Q31 ~ 1/sqrt(y)
-@ r3    a4 Q30 ~ sqrt(y)
-@ r12   result exponent
-
- square32_64 r3, r4,r5, r6,r5,r7
- lsls r6,r0,#8
- lsrs r7,r1,#2
- subs r6,r4
- sbcs r7,r5                    @ r4=(q60)y-a4*a4
-
-@ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60
-
- lsls r5,r7,#29
- lsrs r6,#3
- adcs r6,r5                    @ r4 Q57 with rounding
- muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2    @ d4=a3*r4/2 Q89
-@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30
-
- adds r2,#8
- asrs r2,#5                    @ d4 Q52, rounded to Q53 with spare bit in carry
-
-@ here
-@ r0    y low bits
-@ r1    y Q30
-@ r2    d4 Q52, rounded to Q53
-@ C flag contains d4_b53
-@ r3    a4 Q30
-
- bcs dq_5
-
- lsrs r5,r3,#10                @ a4 Q52
- lsls r4,r3,#22
-
- asrs r1,r2,#31
- adds r0,r2,r4
- adcs r1,r5                    @ a4+d4
-
- add r1,r12                    @ pack exponent
- pop {r4-r7,r15}
-
-.ltorg
- 
-
-@ round(sqrt(2^22./[68:8:252]))
-drsqrtapp:
-.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8
-.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95
-.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81
-
-dq_5:
-@ here we are near a rounding boundary, C is set
- adcs r2,r2,r2                 @ d4 Q53+1ulp
- lsrs r5,r3,#9
- lsls r4,r3,#23                @ r4:r5 a4 Q53
- asrs r1,r2,#31
- adds r4,r2,r4
- adcs r5,r1                    @ r4:r5 a5=a4+d4 Q53+1ulp
- movs r3,r5
- muls r3,r4
- square32_64 r4,r1,r2,r6,r2,r7
- adds r2,r3
- adds r2,r3                    @ r1:r2 a5^2 Q106
- lsls r0,#22                   @ y Q84
-
- rsbs r1,#0
- sbcs r0,r2                    @ remainder y-a5^2
- bmi 1f                        @ y<a5^2: no need to increment a5
- movs r3,#0
- adds r4,#1
- adcs r5,r3                    @ bump a5 if over rounding boundary
-1:
- lsrs r0,r4,#1
- lsrs r1,r5,#1
- lsls r5,#31
- orrs r0,r5
- add r1,r12
- pop {r4-r7,r15}
-
-@ compare r0:r1 against r2:r3, returning -1/0/1 for <, =, >
-@ also set flags accordingly
-.thumb_func
-qfp_dcmp:
- push {r4,r6,r7,r14}
- ldr r7,=#0x7ff                @ flush NaNs and denormals
- lsls r4,r1,#1
- lsrs r4,#21
- beq 1f
- cmp r4,r7
- bne 2f
-1:
- movs r0,#0
- lsrs r1,#20
- lsls r1,#20
-2:
- lsls r4,r3,#1
- lsrs r4,#21
- beq 1f
- cmp r4,r7
- bne 2f
-1:
- movs r2,#0
- lsrs r3,#20
- lsls r3,#20
-2:
-dcmp_fast_entry:
- movs r6,#1
- eors r3,r1
- bmi 4f                        @ opposite signs? then can proceed on basis of sign of x
- eors r3,r1                    @ restore r3
- bpl 1f
- rsbs r6,#0                    @ negative? flip comparison
-1:
- cmp r1,r3
- bne 1f
- cmp r0,r2
- bhi 2f
- blo 3f
-5:
- movs r6,#0                    @ equal? result is 0
-1:
- bgt 2f
-3:
- rsbs r6,#0
-2:
- subs r0,r6,#0                 @ copy and set flags
- pop {r4,r6,r7,r15}
-4:
- orrs r3,r1                    @ make -0==+0
- adds r3,r3
- orrs r3,r0
- orrs r3,r2
- beq 5b
- cmp r1,#0
- bge 2b
- b 3b
- 
-
-@ "scientific" functions start here
-
-.thumb_func
-push_r8_r11:
- mov r4,r8
- mov r5,r9
- mov r6,r10
- mov r7,r11
- push {r4-r7}
- bx r14
-
-.thumb_func
-pop_r8_r11:
- pop {r4-r7}
- mov r8,r4
- mov r9,r5
- mov r10,r6
- mov r11,r7
- bx r14
-
-@ double-length CORDIC rotation step
-
-@ r0:r1   ω
-@ r6      32-i (complementary shift)
-@ r7      i (shift)
-@ r8:r9   x
-@ r10:r11 y
-@ r12     coefficient pointer
-
-@ an option in rotation mode would be to compute the sequence of σ values
-@ in one pass, rotate the initial vector by the residual ω and then run a
-@ second pass to compute the final x and y. This would relieve pressure
-@ on registers and hence possibly be faster. The same trick does not work
-@ in vectoring mode (but perhaps one could work to single precision in
-@ a first pass and then double precision in a second pass?).
-
-.thumb_func
-dcordic_vec_step:
- mov r2,r12
- ldmia r2!,{r3,r4}
- mov r12,r2
- mov r2,r11
- cmp r2,#0
- blt 1f
- b 2f
-
-.thumb_func
-dcordic_rot_step:
- mov r2,r12
- ldmia r2!,{r3,r4}
- mov r12,r2
- cmp r1,#0
- bge 1f
-2:
-@ ω<0 / y>=0
-@ ω+=dω
-@ x+=y>>i, y-=x>>i
- adds r0,r3
- adcs r1,r4
-
- mov r3,r11
- asrs r3,r7
- mov r4,r11
- lsls r4,r6
- mov r2,r10
- lsrs r2,r7
- orrs r2,r4                    @ r2:r3 y>>i, rounding in carry
- mov r4,r8
- mov r5,r9                     @ r4:r5 x
- adcs r2,r4
- adcs r3,r5                    @ r2:r3 x+(y>>i)
- mov r8,r2
- mov r9,r3
-
- mov r3,r5
- lsls r3,r6
- asrs r5,r7
- lsrs r4,r7
- orrs r4,r3                    @ r4:r5 x>>i, rounding in carry
- mov r2,r10
- mov r3,r11
- sbcs r2,r4
- sbcs r3,r5                    @ r2:r3 y-(x>>i)
- mov r10,r2
- mov r11,r3
- bx r14
-
-
-@ ω>0 / y<0
-@ ω-=dω
-@ x-=y>>i, y+=x>>i
-1:
- subs r0,r3
- sbcs r1,r4
-
- mov r3,r9
- asrs r3,r7
- mov r4,r9
- lsls r4,r6
- mov r2,r8
- lsrs r2,r7
- orrs r2,r4                    @ r2:r3 x>>i, rounding in carry
- mov r4,r10
- mov r5,r11                    @ r4:r5 y
- adcs r2,r4
- adcs r3,r5                    @ r2:r3 y+(x>>i)
- mov r10,r2
- mov r11,r3
-
- mov r3,r5
- lsls r3,r6
- asrs r5,r7
- lsrs r4,r7
- orrs r4,r3                    @ r4:r5 y>>i, rounding in carry
- mov r2,r8
- mov r3,r9
- sbcs r2,r4
- sbcs r3,r5                    @ r2:r3 x-(y>>i)
- mov r8,r2
- mov r9,r3
- bx r14
-
-ret_dzero:
- movs r0,#0
- movs r1,#0
- bx r14
-
-@ convert double to signed int, rounding towards 0, clamping
-.thumb_func
-qfp_double2int_z:
- cmp r1,#0
- bge qfp_double2int  @ +ve or zero? then use rounding towards -Inf
- push {r14}
- lsls r1,#1          @ -ve: clear sign bit
- lsrs r1,#1
- bl qfp_double2uint  @ convert to unsigned, rounding towards -Inf
- movs r1,#1
- lsls r1,#31         @ r1=0x80000000
- cmp r0,r1
- bhi 1f
- rsbs r0,#0
- pop {r15}
-1:
- mov r0,r1
- pop {r15}
-
-@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf
-@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works
-
-.thumb_func
-qfp_double2int:
- movs r2,#0                    @ and fall through
-.thumb_func
-qfp_double2fix:
- push {r14}
- adds r2,#32
- bl qfp_double2fix64
- movs r0,r1
- pop {r15}
-
-.thumb_func
-qfp_double2uint:
- movs r2,#0                    @ and fall through
-.thumb_func
-qfp_double2ufix:
- push {r14}
- adds r2,#32
- bl qfp_double2ufix64
- movs r0,r1
- pop {r15}
-
-.thumb_func
-qfp_float2int64_z:
- cmp r0,#0
- bge qfp_float2int64 @ +ve or zero? then use rounding towards -Inf
- push {r14}
- lsls r0,#1          @ -ve: clear sign bit
- lsrs r0,#1
- bl qfp_float2uint64 @ convert to unsigned, rounding towards -Inf
- movs r2,#1
- lsls r2,#31         @ r2=0x80000000
- cmp r1,r2
- bhs 1f
- mvns r1,r1
- rsbs r0,#0
- bcc 2f
- adds r1,#1
-2:
- pop {r15}
-1:
- movs r0,#0
- mov r1,r2
- pop {r15}
-
-.thumb_func
-qfp_float2int64:
- movs r1,#0                    @ and fall through
-.thumb_func
-qfp_float2fix64:
- push {r14}
- bl f2fix
- b d2f64_a
-
-.thumb_func
-qfp_float2uint64:
- movs r1,#0                    @ and fall through
-.thumb_func
-qfp_float2ufix64:
- asrs r3,r0,#23                @ negative? return 0
- bmi ret_dzero
-@ and fall through
-
-@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
-@ result clamped so that r3 can only be 0 or -1
-@ trashes r12
-.thumb_func
-f2fix:
- push {r4,r14}
- mov r12,r1
- asrs r3,r0,#31
- lsls r0,#1
- lsrs r2,r0,#24
- beq 1f                        @ zero?
- cmp r2,#0xff                  @ Inf?
- beq 2f
- subs r1,r2,#1
- subs r2,#0x7f                 @ remove exponent bias
- lsls r1,#24
- subs r0,r1                    @ insert implied 1
- eors r0,r3
- subs r0,r3                    @ top two's complement
- asrs r1,r0,#4                 @ convert to double format
- lsls r0,#28
- b d2fix_a
-1:
- movs r0,#0
- movs r1,r0
- movs r3,r0
- pop {r4,r15}
-2:
- mvns r0,r3                    @ return max/min value
- mvns r1,r3
- pop {r4,r15}
-
-.thumb_func
-qfp_double2int64_z:
- cmp r1,#0
- bge qfp_double2int64 @ +ve or zero? then use rounding towards -Inf
- push {r14}
- lsls r1,#1           @ -ve: clear sign bit
- lsrs r1,#1
- bl qfp_double2uint64 @ convert to unsigned, rounding towards -Inf
- cmp r1,#0
- blt 1f
- mvns r1,r1
- rsbs r0,#0
- bcc 2f
- adds r1,#1
-2:
- pop {r15}
-1:
- movs r0,#0
- movs r1,#1
- lsls r1,#31          @ 0x80000000
- pop {r15}
-
-.thumb_func
-qfp_double2int64:
- movs r2,#0                    @ and fall through
-.thumb_func
-qfp_double2fix64:
- push {r14}
- bl d2fix
-d2f64_a:
- asrs r2,r1,#31
- cmp r2,r3
- bne 1f                        @ sign extension bits fail to match sign of result?
- pop {r15}
-1:
- mvns r0,r3
- movs r1,#1
- lsls r1,#31
- eors r1,r1,r0                 @ generate extreme fixed-point values
- pop {r15}
-
-.thumb_func
-qfp_double2uint64:
- movs r2,#0                    @ and fall through
-.thumb_func
-qfp_double2ufix64:
- asrs r3,r1,#20                @ negative? return 0
- bmi ret_dzero
-@ and fall through
-
-@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf
-@ result clamped so that r3 can only be 0 or -1
-@ trashes r12
-.thumb_func
-d2fix:
- push {r4,r14}
- mov r12,r2
- bl dunpacks
- asrs r4,r2,#16
- adds r4,#1
- bge 1f
- movs r1,#0                    @ -0 -> +0
-1:
- asrs r3,r1,#31
-d2fix_a:
-@ here
-@ r0:r1 two's complement mantissa
-@ r2    unbaised exponent
-@ r3    mantissa sign extension bits
- add r2,r12                    @ exponent plus offset for required binary point position
- subs r2,#52                   @ required shift
- bmi 1f                        @ shift down?
-@ here a shift up by r2 places
- cmp r2,#12                    @ will clamp?
- bge 2f
- movs r4,r0
- lsls r1,r2
- lsls r0,r2
- rsbs r2,#0
- adds r2,#32                   @ complementary shift
- lsrs r4,r2
- orrs r1,r4
- pop {r4,r15}
-2:
- mvns r0,r3
- mvns r1,r3                    @ overflow: clamp to extreme fixed-point values
- pop {r4,r15}
-1:
-@ here a shift down by -r2 places
- adds r2,#32
- bmi 1f                        @ long shift?
- mov r4,r1
- lsls r4,r2
- rsbs r2,#0
- adds r2,#32                   @ complementary shift
- asrs r1,r2
- lsrs r0,r2
- orrs r0,r4
- pop {r4,r15}
-1:
-@ here a long shift down
- movs r0,r1
- asrs r1,#31                   @ shift down 32 places
- adds r2,#32
- bmi 1f                        @ very long shift?
- rsbs r2,#0
- adds r2,#32
- asrs r0,r2
- pop {r4,r15}
-1:
- movs r0,r3                    @ result very near zero: use sign extension bits
- movs r1,r3
- pop {r4,r15}
-
-@ float <-> double conversions
-.thumb_func
-qfp_float2double:
- lsrs r3,r0,#31                @ sign bit
- lsls r3,#31
- lsls r1,r0,#1
- lsrs r2,r1,#24                @ exponent
- beq 1f                        @ zero?
- cmp r2,#0xff                  @ Inf?
- beq 2f
- lsrs r1,#4                    @ exponent and top 20 bits of mantissa
- ldr r2,=#(0x3ff-0x7f)<<20     @ difference in exponent offsets
- adds r1,r2
- orrs r1,r3
- lsls r0,#29                   @ bottom 3 bits of mantissa
- bx r14
-1:
- movs r1,r3                    @ return signed zero
-3:
- movs r0,#0
- bx r14
-2:
- ldr r1,=#0x7ff00000           @ return signed infinity
- adds r1,r3
- b 3b
-
-.thumb_func
-qfp_double2float:
- lsls r2,r1,#1
- lsrs r2,#21                   @ exponent
- ldr r3,=#0x3ff-0x7f
- subs r2,r3                    @ fix exponent bias
- ble 1f                        @ underflow or zero
- cmp r2,#0xff
- bge 2f                        @ overflow or infinity
- lsls r2,#23                   @ position exponent of result
- lsrs r3,r1,#31
- lsls r3,#31
- orrs r2,r3                    @ insert sign
- lsls r3,r0,#3                 @ rounding bits
- lsrs r0,#29
- lsls r1,#12
- lsrs r1,#9
- orrs r0,r1                    @ assemble mantissa
- orrs r0,r2                    @ insert exponent and sign
- lsls r3,#1
- bcc 3f                        @ no rounding
- beq 4f                        @ all sticky bits 0?
-5:
- adds r0,#1
-3:
- bx r14
-4:
- lsrs r3,r0,#1                 @ odd? then round up
- bcs 5b
- bx r14
-1:
- beq 6f                        @ check case where value is just less than smallest normal
-7:
- lsrs r0,r1,#31
- lsls r0,#31
- bx r14
-6:
- lsls r2,r1,#12                @ 20 1:s at top of mantissa?
- asrs r2,#12
- adds r2,#1
- bne 7b
- lsrs r2,r0,#29                @ and 3 more 1:s?
- cmp r2,#7
- bne 7b
- movs r2,#1                    @ return smallest normal with correct sign
- b 8f
-2:
- movs r2,#0xff
-8:
- lsrs r0,r1,#31                @ return signed infinity
- lsls r0,#8
- adds r0,r2
- lsls r0,#23
- bx r14
-
-@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding
-
-.thumb_func
-qfp_uint2double:
- movs r1,#0                    @ and fall through
-.thumb_func
-qfp_ufix2double:
- movs r2,r1
- movs r1,#0
- b qfp_ufix642double
-
-.thumb_func
-qfp_int2double:
- movs r1,#0                    @ and fall through
-.thumb_func
-qfp_fix2double:
- movs r2,r1
- asrs r1,r0,#31                @ sign extend
- b qfp_fix642double
-
-.thumb_func
-qfp_uint642double:
- movs r2,#0                    @ and fall through
-.thumb_func
-qfp_ufix642double:
- movs r3,#0
- b uf2d
-
-.thumb_func
-qfp_int642double:
- movs r2,#0                    @ and fall through
-.thumb_func
-qfp_fix642double:
- asrs r3,r1,#31                @ sign bit across all bits
- eors r0,r3
- eors r1,r3
- subs r0,r3
- sbcs r1,r3
-uf2d:
- push {r4,r5,r14}
- ldr r4,=#0x432
- subs r2,r4,r2                 @ form biased exponent
-@ here
-@ r0:r1 unnormalised mantissa
-@ r2 -Q (will become exponent)
-@ r3 sign across all bits
- cmp r1,#0
- bne 1f                        @ short normalising shift?
- movs r1,r0
- beq 2f                        @ zero? return it
- movs r0,#0
- subs r2,#32                   @ fix exponent
-1:
- asrs r4,r1,#21
- bne 3f                        @ will need shift down (and rounding?)
- bcs 4f                        @ normalised already?
-5:
- subs r2,#1
- adds r0,r0                    @ shift up
- adcs r1,r1
- lsrs r4,r1,#21
- bcc 5b
-4:
- ldr r4,=#0x7fe
- cmp r2,r4
- bhs 6f                        @ over/underflow? return signed zero/infinity
-7:
- lsls r2,#20                   @ pack and return
- adds r1,r2
- lsls r3,#31
- adds r1,r3
-2:
- pop {r4,r5,r15}
-6:                             @ return signed zero/infinity according to unclamped exponent in r2
- mvns r2,r2
- lsrs r2,#21
- movs r0,#0
- movs r1,#0
- b 7b
-
-3:
-@ here we need to shift down to normalise and possibly round
- bmi 1f                        @ already normalised to Q63?
-2:
- subs r2,#1
- adds r0,r0                    @ shift up
- adcs r1,r1
- bpl 2b
-1:
-@ here we have a 1 in b63 of r0:r1
- adds r2,#11                   @ correct exponent for subsequent shift down
- lsls r4,r0,#21                @ save bits for rounding
- lsrs r0,#11
- lsls r5,r1,#21
- orrs r0,r5
- lsrs r1,#11
- lsls r4,#1
- beq 1f                        @ sticky bits are zero?
-8:
- movs r4,#0
- adcs r0,r4
- adcs r1,r4
- b 4b
-1:
- bcc 4b                        @ sticky bits are zero but not on rounding boundary
- lsrs r4,r0,#1                 @ increment if odd (force round to even)
- b 8b
-
-
-.ltorg
-
-.thumb_func
-dunpacks:
- mdunpacks r0,r1,r2,r3,r4
- ldr r3,=#0x3ff
- subs r2,r3                    @ exponent without offset
- bx r14
-
-@ r0:r1  signed mantissa Q52
-@ r2     unbiased exponent < 10 (i.e., |x|<2^10)
-@ r4     pointer to:
-@          - divisor reciprocal approximation r=1/d Q15
-@          - divisor d Q62  0..20
-@          - divisor d Q62 21..41
-@          - divisor d Q62 42..62
-@ returns:
-@ r0:r1  reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)
-@ r2     quotient q (number of reductions)
-@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign
-@ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)
-.thumb_func
-dreduce:
- adds r2,#2                    @ e+2
- bmi 1f                        @ |x|<0.25, too small to need adjustment
- cmp r2,#12
- bge 4f
-2:
- movs r5,#17
- subs r5,r2                    @ 15-e
- movs r3,r1                    @ Q20
- asrs r3,r5                    @ x Q5
- adds r2,#8                    @ e+10
- adds r5,#7                    @ 22-e = 32-(e+10)
- movs r6,r0
- lsrs r6,r5
- lsls r0,r2
- lsls r1,r2
- orrs r1,r6                    @ r0:r1 x Q62
- ldmia r4,{r4-r7}
- muls r3,r4                    @ rx Q20
- asrs r2,r3,#20
- movs r3,#0
- adcs r2,r3                    @ rx Q0 rounded = q; for e.g. r=1.5 |q|<1.5*2^10
- muls r5,r2                    @ qd in pieces: L Q62
- muls r6,r2                    @               M Q41
- muls r7,r2                    @               H Q20
- lsls r7,#10
- asrs r4,r6,#11
- lsls r6,#21
- adds r6,r5
- adcs r7,r4
- asrs r5,#31
- adds r7,r5                    @ r6:r7 qd Q62
- subs r0,r6
- sbcs r1,r7                    @ remainder Q62
- bx r14
-4:
- movs r2,#12                   @ overflow: clamp to +/-1024
- movs r0,#0
- asrs r1,#31
- lsls r1,#1
- adds r1,#1
- lsls r1,#20
- b 2b
-
-1:
- lsls r1,#8
- lsrs r3,r0,#24
- orrs r1,r3
- lsls r0,#8                    @ r0:r1 Q60, to be shifted down -r2 places
- rsbs r3,r2,#0
- adds r2,#32                   @ shift down in r3, complementary shift in r2
- bmi 1f                        @ long shift?
-2:
- movs r4,r1
- asrs r1,r3
- lsls r4,r2
- lsrs r0,r3
- orrs r0,r4
- movs r2,#0                    @ rounding
- adcs r0,r2
- adcs r1,r2
- bx r14
-
-1:
- movs r0,r1                    @ down 32 places
- asrs r1,#31
- subs r3,#32
- adds r2,#32
- bpl 2b
- movs r0,#0                    @ very long shift? return 0
- movs r1,#0
- movs r2,#0
- bx r14
-
-.thumb_func
-qfp_dtan:
- push {r4-r7,r14}
- bl push_r8_r11
- bl dsincos
- mov r12,r0                    @ save ε
- bl dcos_finish
- push {r0,r1}
- mov r0,r12
- bl dsin_finish
- pop {r2,r3}
- bl pop_r8_r11
- b ddiv0                       @ compute sin θ/cos θ
-
-.thumb_func
-qfp_dcos:
- push {r4-r7,r14}
- bl push_r8_r11
- bl dsincos
- bl dcos_finish
- b 1f
-
-.thumb_func
-qfp_dsin:
- push {r4-r7,r14}
- bl push_r8_r11
- bl dsincos
- bl dsin_finish
-1:
- bl pop_r8_r11
- pop {r4-r7,r15}
-
-
-@ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that
-@ θ=α+ε and |ε|≤2^-32
-@ on return:
-@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
-@ r8:r9   cos α Q62
-@ r10:r11 sin α Q62
-.thumb_func
-dsincos:
- push {r14}
- bl dunpacks
- adr r4,dreddata0
- bl dreduce
-
- movs r4,#0
- ldr r5,=#0x9df04dbb           @ this value compensates for the non-unity scaling of the CORDIC rotations
- ldr r6,=#0x36f656c5
- lsls r2,#31
- bcc 1f
-@ quadrant 2 or 3
- mvns r6,r6
- rsbs r5,r5,#0
- adcs r6,r4
-1:
- lsls r2,#1
- bcs 1f
-@ even quadrant
- mov r10,r4
- mov r11,r4
- mov r8,r5
- mov r9,r6
- b 2f
-1:
-@ odd quadrant
- mov r8,r4
- mov r9,r4
- mov r10,r5
- mov r11,r6
-2:
- adr r4,dtab_cc
- mov r12,r4
- movs r7,#1
- movs r6,#31
-1:
- bl dcordic_rot_step
- adds r7,#1
- subs r6,#1
- cmp r7,#33
- bne 1b
- pop {r15}
-
-dcos_finish:
-@ here
-@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
-@ r8:r9   cos α Q62
-@ r10:r11 sin α Q62
-@ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α
- mov r1,r11
-@ mov r2,r10
-@ lsrs r2,#31
-@ adds r1,r2                    @ rounding improves accuracy very slightly
- muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
-@ r2:r3   ε sin α Q(62+62-32)=Q92
- mov r0,r8
- mov r1,r9
- lsls r5,r3,#2
- asrs r3,r3,#30
- lsrs r2,r2,#30
- orrs r2,r5
- sbcs r0,r2                    @ include rounding
- sbcs r1,r3
- movs r2,#62
- b qfp_fix642double
-
-dsin_finish:
-@ here
-@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
-@ r8:r9   cos α Q62
-@ r10:r11 sin α Q62
-@ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α
- mov r1,r9
- muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
-@ r2:r3   ε cos α Q(62+62-32)=Q92
- mov r0,r10
- mov r1,r11
- lsls r5,r3,#2
- asrs r3,r3,#30
- lsrs r2,r2,#30
- orrs r2,r5
- adcs r0,r2                    @ include rounding
- adcs r1,r3
- movs r2,#62
- b qfp_fix642double
-
-.ltorg
-.align 2
-dreddata0:
-.word 0x0000517d               @ 2/π Q15
-.word 0x0014611A               @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces
-.word 0x000A8885
-.word 0x001921FB
-
-.thumb_func
-qfp_datan2:
-@ r0:r1 y
-@ r2:r3 x
- push {r4-r7,r14}
- bl push_r8_r11
- ldr r5,=#0x7ff00000
- movs r4,r1
- ands r4,r5                    @ y==0?
- beq 1f
- cmp r4,r5                     @ or Inf/NaN?
- bne 2f
-1:
- lsrs r1,#20                   @ flush
- lsls r1,#20
- movs r0,#0
-2:
- movs r4,r3
- ands r4,r5                    @ x==0?
- beq 1f
- cmp r4,r5                     @ or Inf/NaN?
- bne 2f
-1:
- lsrs r3,#20                   @ flush
- lsls r3,#20
- movs r2,#0
-2:
- movs r6,#0                    @ quadrant offset
- lsls r5,#11                   @ constant 0x80000000
- cmp r3,#0
- bpl 1f                        @ skip if x positive
- movs r6,#2
- eors r3,r5
- eors r1,r5
- bmi 1f                        @ quadrant offset=+2 if y was positive
- rsbs r6,#0                    @ quadrant offset=-2 if y was negative
-1:
-@ now in quadrant 0 or 3
- adds r7,r1,r5                 @ r7=-r1
- bpl 1f
-@ y>=0: in quadrant 0
- cmp r1,r3
- ble 2f                        @ y<~x so 0≤θ<~π/4: skip
- adds r6,#1
- eors r1,r5                    @ negate x
- b 3f                          @ and exchange x and y = rotate by -π/2
-1:
- cmp r3,r7
- bge 2f                        @ -y<~x so -π/4<~θ≤0: skip
- subs r6,#1
- eors r3,r5                    @ negate y and ...
-3:
- movs r7,r0                    @ exchange x and y
- movs r0,r2
- movs r2,r7
- movs r7,r1
- movs r1,r3
- movs r3,r7
-2:
-@ here -π/4<~θ<~π/4
-@ r6 has quadrant offset
- push {r6}
- cmp r2,#0
- bne 1f
- cmp r3,#0
- beq 10f                       @ x==0 going into division?
- lsls r4,r3,#1
- asrs r4,#21
- adds r4,#1
- bne 1f                        @ x==Inf going into division?
- lsls r4,r1,#1
- asrs r4,#21
- adds r4,#1                    @ y also ±Inf?
- bne 10f
- subs r1,#1                    @ make them both just finite
- subs r3,#1
- b 1f
-
-10:
- movs r0,#0
- movs r1,#0
- b 12f
- 
-1:
- bl qfp_ddiv
- movs r2,#62
- bl qfp_double2fix64
-@ r0:r1 y/x
- mov r10,r0
- mov r11,r1
- movs r0,#0                    @ ω=0
- movs r1,#0
- mov r8,r0
- movs r2,#1
- lsls r2,#30
- mov r9,r2                     @ x=1
-
- adr r4,dtab_cc
- mov r12,r4
- movs r7,#1
- movs r6,#31
-1:
- bl dcordic_vec_step
- adds r7,#1
- subs r6,#1
- cmp r7,#33
- bne 1b
-@ r0:r1   atan(y/x) Q62
-@ r8:r9   x residual Q62
-@ r10:r11 y residual Q62
- mov r2,r9
- mov r3,r10
- subs r2,#12                   @ this makes atan(0)==0
-@ the following is basically a division residual y/x ~ atan(residual y/x)
- movs r4,#1
- lsls r4,#29
- movs r7,#0
-2:
- lsrs r2,#1
- movs r3,r3                    @ preserve carry
- bmi 1f
- sbcs r3,r2
- adds r0,r4
- adcs r1,r7
- lsrs r4,#1
- bne 2b
- b 3f
-1:
- adcs r3,r2
- subs r0,r4
- sbcs r1,r7
- lsrs r4,#1
- bne 2b
-3:
- lsls r6,r1,#31
- asrs r1,#1
- lsrs r0,#1
- orrs r0,r6                    @ Q61
-
-12:
- pop {r6}
-
- cmp r6,#0
- beq 1f
- ldr r4,=#0x885A308D           @ π/2 Q61
- ldr r5,=#0x3243F6A8
- bpl 2f
- mvns r4,r4                    @ negative quadrant offset
- mvns r5,r5
-2:
- lsls r6,#31
- bne 2f                        @ skip if quadrant offset is ±1
- adds r0,r4
- adcs r1,r5
-2:
- adds r0,r4
- adcs r1,r5
-1:
- movs r2,#61
- bl qfp_fix642double
-
- bl pop_r8_r11
- pop {r4-r7,r15}
-
-.ltorg
-
-dtab_cc:
-.word 0x61bb4f69, 0x1dac6705   @ atan 2^-1 Q62
-.word 0x96406eb1, 0x0fadbafc   @ atan 2^-2 Q62
-.word 0xab0bdb72, 0x07f56ea6   @ atan 2^-3 Q62
-.word 0xe59fbd39, 0x03feab76   @ atan 2^-4 Q62
-.word 0xba97624b, 0x01ffd55b   @ atan 2^-5 Q62
-.word 0xdddb94d6, 0x00fffaaa   @ atan 2^-6 Q62
-.word 0x56eeea5d, 0x007fff55   @ atan 2^-7 Q62
-.word 0xaab7776e, 0x003fffea   @ atan 2^-8 Q62
-.word 0x5555bbbc, 0x001ffffd   @ atan 2^-9 Q62
-.word 0xaaaaadde, 0x000fffff   @ atan 2^-10 Q62
-.word 0xf555556f, 0x0007ffff   @ atan 2^-11 Q62
-.word 0xfeaaaaab, 0x0003ffff   @ atan 2^-12 Q62
-.word 0xffd55555, 0x0001ffff   @ atan 2^-13 Q62
-.word 0xfffaaaab, 0x0000ffff   @ atan 2^-14 Q62
-.word 0xffff5555, 0x00007fff   @ atan 2^-15 Q62
-.word 0xffffeaab, 0x00003fff   @ atan 2^-16 Q62
-.word 0xfffffd55, 0x00001fff   @ atan 2^-17 Q62
-.word 0xffffffab, 0x00000fff   @ atan 2^-18 Q62
-.word 0xfffffff5, 0x000007ff   @ atan 2^-19 Q62
-.word 0xffffffff, 0x000003ff   @ atan 2^-20 Q62
-.word 0x00000000, 0x00000200   @ atan 2^-21 Q62 @ consider optimising these
-.word 0x00000000, 0x00000100   @ atan 2^-22 Q62
-.word 0x00000000, 0x00000080   @ atan 2^-23 Q62
-.word 0x00000000, 0x00000040   @ atan 2^-24 Q62
-.word 0x00000000, 0x00000020   @ atan 2^-25 Q62
-.word 0x00000000, 0x00000010   @ atan 2^-26 Q62
-.word 0x00000000, 0x00000008   @ atan 2^-27 Q62
-.word 0x00000000, 0x00000004   @ atan 2^-28 Q62
-.word 0x00000000, 0x00000002   @ atan 2^-29 Q62
-.word 0x00000000, 0x00000001   @ atan 2^-30 Q62
-.word 0x80000000, 0x00000000   @ atan 2^-31 Q62
-.word 0x40000000, 0x00000000   @ atan 2^-32 Q62
-
-.thumb_func
-qfp_dexp:
- push {r4-r7,r14}
- bl dunpacks
- adr r4,dreddata1
- bl dreduce
- cmp r1,#0
- bge 1f
- ldr r4,=#0xF473DE6B
- ldr r5,=#0x2C5C85FD           @ ln2 Q62
- adds r0,r4
- adcs r1,r5
- subs r2,#1
-1:
- push {r2}
- movs r7,#1                    @ shift
- adr r6,dtab_exp
- movs r2,#0
- movs r3,#1
- lsls r3,#30                   @ x=1 Q62
-
-3:
- ldmia r6!,{r4,r5}
- mov r12,r6
- subs r0,r4
- sbcs r1,r5
- bmi 1f
-
- rsbs r6,r7,#0
- adds r6,#32                   @ complementary shift
- movs r5,r3
- asrs r5,r7
- movs r4,r3
- lsls r4,r6
- movs r6,r2
- lsrs r6,r7                    @ rounding bit in carry
- orrs r4,r6
- adcs r2,r4
- adcs r3,r5                    @ x+=x>>i
- b 2f
-
-1:
- adds r0,r4                    @ restore argument
- adcs r1,r5
-2:
- mov r6,r12
- adds r7,#1
- cmp r7,#33
- bne 3b
-
-@ here
-@ r0:r1   ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)
-@ r2:r3   exp a Q62
-@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
- muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1
-@ r4:r1 ε exp a Q(62+62-32)=Q92
- lsrs r4,#30
- lsls r0,r1,#2
- orrs r0,r4
- asrs r1,#30
- adds r0,r2
- adcs r1,r3
-
- pop {r2}
- rsbs r2,#0
- adds r2,#62
- bl qfp_fix642double                 @ in principle we can pack faster than this because we know the exponent
- pop {r4-r7,r15}
-
-.ltorg
-
-.thumb_func
-qfp_dln:
- push {r4-r7,r14}
- lsls r7,r1,#1
- bcs 5f                        @ <0 ...
- asrs r7,#21
- beq 5f                        @ ... or =0? return -Inf
- adds r7,#1
- beq 6f                        @ Inf/NaN? return +Inf
- bl dunpacks
- push {r2}
- lsls r1,#9
- lsrs r2,r0,#23
- orrs r1,r2
- lsls r0,#9
-@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1
-
- movs r7,#1                    @ shift
- adr r6,dtab_exp
- mov r12,r6
- movs r2,#0
- movs r3,#0                    @ y=0 Q62
-
-3:
- rsbs r6,r7,#0
- adds r6,#32                   @ complementary shift
- movs r5,r1
- asrs r5,r7
- movs r4,r1
- lsls r4,r6
- movs r6,r0
- lsrs r6,r7
- orrs r4,r6                    @ x>>i, rounding bit in carry
- adcs r4,r0
- adcs r5,r1                    @ x+(x>>i)
-
- lsrs r6,r5,#30
- bne 1f                        @ x+(x>>i)>1?
- movs r0,r4
- movs r1,r5                    @ x+=x>>i
- mov r6,r12
- ldmia r6!,{r4,r5}
- subs r2,r4
- sbcs r3,r5
-
-1:
- movs r4,#8
- add r12,r4
- adds r7,#1
- cmp r7,#33
- bne 3b
-@ here:
-@ r0:r1 residual x, nearly 1 Q62
-@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62
-@ result is y + ln2 + ln x ~ y + ln2 + (x-1)
- lsls r1,#2
- asrs r1,#2                    @ x-1
- adds r2,r0
- adcs r3,r1
-
- pop {r7}
-@ here:
-@ r2:r3 ln m/2 = ln m - ln2 Q62
-@ r7    unbiased exponent
-
- adr r4,dreddata1+4
- ldmia r4,{r0,r1,r4}
- adds r7,#1
- muls r0,r7                    @ Q62
- muls r1,r7                    @ Q41
- muls r4,r7                    @ Q20
- lsls r7,r1,#21
- asrs r1,#11
- asrs r5,r1,#31
- adds r0,r7
- adcs r1,r5
- lsls r7,r4,#10
- asrs r4,#22
- asrs r5,r1,#31
- adds r1,r7
- adcs r4,r5
-@ r0:r1:r4 exponent*ln2 Q62
- asrs r5,r3,#31
- adds r0,r2
- adcs r1,r3
- adcs r4,r5
-@ r0:r1:r4 result Q62
- movs r2,#62
-1:
- asrs r5,r1,#31
- cmp r4,r5
- beq 2f                        @ r4 a sign extension of r1?
- lsrs r0,#4                    @ no: shift down 4 places and try again
- lsls r6,r1,#28
- orrs r0,r6
- lsrs r1,#4
- lsls r6,r4,#28
- orrs r1,r6
- asrs r4,#4
- subs r2,#4
- b 1b
-2:
- bl qfp_fix642double
- pop {r4-r7,r15}
-
-5:
- ldr r1,=#0xfff00000
- movs r0,#0
- pop {r4-r7,r15}
-
-6:
- ldr r1,=#0x7ff00000
- movs r0,#0
- pop {r4-r7,r15}
-
-
-.ltorg
-
-.align 2
-dreddata1:
-.word 0x0000B8AA               @ 1/ln2 Q15
-.word 0x0013DE6B               @ ln2 Q62 Q62=2C5C85FDF473DE6B split into 21-bit pieces
-.word 0x000FEFA3
-.word 0x000B1721
-
-dtab_exp:
-.word 0xbf984bf3, 0x19f323ec   @ log 1+2^-1 Q62
-.word 0xcd4d10d6, 0x0e47fbe3   @ log 1+2^-2 Q62
-.word 0x8abcb97a, 0x0789c1db   @ log 1+2^-3 Q62
-.word 0x022c54cc, 0x03e14618   @ log 1+2^-4 Q62
-.word 0xe7833005, 0x01f829b0   @ log 1+2^-5 Q62
-.word 0x87e01f1e, 0x00fe0545   @ log 1+2^-6 Q62
-.word 0xac419e24, 0x007f80a9   @ log 1+2^-7 Q62
-.word 0x45621781, 0x003fe015   @ log 1+2^-8 Q62
-.word 0xa9ab10e6, 0x001ff802   @ log 1+2^-9 Q62
-.word 0x55455888, 0x000ffe00   @ log 1+2^-10 Q62
-.word 0x0aa9aac4, 0x0007ff80   @ log 1+2^-11 Q62
-.word 0x01554556, 0x0003ffe0   @ log 1+2^-12 Q62
-.word 0x002aa9ab, 0x0001fff8   @ log 1+2^-13 Q62
-.word 0x00055545, 0x0000fffe   @ log 1+2^-14 Q62
-.word 0x8000aaaa, 0x00007fff   @ log 1+2^-15 Q62
-.word 0xe0001555, 0x00003fff   @ log 1+2^-16 Q62
-.word 0xf80002ab, 0x00001fff   @ log 1+2^-17 Q62
-.word 0xfe000055, 0x00000fff   @ log 1+2^-18 Q62
-.word 0xff80000b, 0x000007ff   @ log 1+2^-19 Q62
-.word 0xffe00001, 0x000003ff   @ log 1+2^-20 Q62
-.word 0xfff80000, 0x000001ff   @ log 1+2^-21 Q62
-.word 0xfffe0000, 0x000000ff   @ log 1+2^-22 Q62
-.word 0xffff8000, 0x0000007f   @ log 1+2^-23 Q62
-.word 0xffffe000, 0x0000003f   @ log 1+2^-24 Q62
-.word 0xfffff800, 0x0000001f   @ log 1+2^-25 Q62
-.word 0xfffffe00, 0x0000000f   @ log 1+2^-26 Q62
-.word 0xffffff80, 0x00000007   @ log 1+2^-27 Q62
-.word 0xffffffe0, 0x00000003   @ log 1+2^-28 Q62
-.word 0xfffffff8, 0x00000001   @ log 1+2^-29 Q62
-.word 0xfffffffe, 0x00000000   @ log 1+2^-30 Q62
-.word 0x80000000, 0x00000000   @ log 1+2^-31 Q62
-.word 0x40000000, 0x00000000   @ log 1+2^-32 Q62
-
 qfp_lib_end: