@ Copyright 2019-2024 Mark Owen
@ http://www.quinapalus.com
@ E-mail: qfp@quinapalus.com
@
@ This file is free software: you can redistribute it and/or modify
@ it under the terms of version 2 of the GNU General Public License
@ as published by the Free Software Foundation.
@
@ This file is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this file. If not, see <http://www.gnu.org/licenses/> or
@ write to the Free Software Foundation, Inc., 51 Franklin Street,
@ Fifth Floor, Boston, MA 02110-1301, USA.
.syntax unified
.cpu cortex-m0plus
.thumb
@ exported symbols
.global qfp_fadd
.global qfp_fsub
.global qfp_fmul
.global qfp_fdiv
.global qfp_fcmp
.global qfp_fsqrt
.global qfp_float2int
.global qfp_float2fix
.global qfp_float2uint
.global qfp_float2ufix
.global qfp_int2float
.global qfp_fix2float
.global qfp_uint2float
.global qfp_ufix2float
.global qfp_int642float
.global qfp_fix642float
.global qfp_uint642float
.global qfp_ufix642float
.global qfp_fcos
.global qfp_fsin
.global qfp_ftan
.global qfp_fatan2
.global qfp_fexp
.global qfp_fln
.global qfp_dadd
.global qfp_dsub
.global qfp_dmul
.global qfp_ddiv
.global qfp_dsqrt
.global qfp_dcos
.global qfp_dsin
.global qfp_dtan
.global qfp_datan2
.global qfp_dexp
.global qfp_dln
.global qfp_dcmp
.global qfp_float2int64
.global qfp_float2fix64
.global qfp_float2uint64
.global qfp_float2ufix64
.global qfp_float2int_z
.global qfp_float2int64_z
.global qfp_double2int
.global qfp_double2fix
.global qfp_double2uint
.global qfp_double2ufix
.global qfp_double2int64
.global qfp_double2fix64
.global qfp_double2uint64
.global qfp_double2ufix64
.global qfp_double2int_z
.global qfp_double2int64_z
.global qfp_int2double
.global qfp_fix2double
.global qfp_uint2double
.global qfp_ufix2double
.global qfp_int642double
.global qfp_fix642double
.global qfp_uint642double
.global qfp_ufix642double
.global qfp_double2float
.global qfp_float2double
qfp_lib_start:
@ exchange r0<->r1, r2<->r3
xchxy:
push {r0,r2,r14}
mov r0,r1
mov r2,r3
pop {r1,r3,r15}
@ IEEE single in r0 -> signed (two's complement) mantissa in r0 9Q23 (24 significant bits), signed exponent (bias removed) in r2
@ trashes r4; zero, denormal -> mantissa=+/-1, exponent=-380; Inf, NaN -> mantissa=+/-1, exponent=+640
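@ for reference, a minimal C model of this contract ("+/-1" meaning +/-1.0
@ in Q23, i.e. +/-(1<<23); the code below gets there by shifts alone):
@   #include <stdint.h>
@   typedef struct { int32_t mant, exp; } unpacked;
@   static unpacked unpackx_model(uint32_t f) {
@       int32_t e = (f >> 23) & 0xff;               // biased exponent
@       int32_t m = (f & 0x7fffff) | (1 << 23);     // reinstate implied 1
@       if (f >> 31) m = -m;                        // two's complement mantissa
@       if (e == 0)   return (unpacked){ m < 0 ? -(1<<23) : 1<<23, -380 };
@       if (e == 255) return (unpacked){ m < 0 ? -(1<<23) : 1<<23,  640 };
@       return (unpacked){ m, e - 127 };
@   }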
unpackx:
lsrs r2,r0,#23 @ save exponent and sign
lsls r0,#9 @ extract mantissa
lsrs r0,#9
movs r4,#1
lsls r4,#23
orrs r0,r4 @ reinstate implied leading 1
cmp r2,#255 @ test sign bit
uxtb r2,r2 @ clear it
bls 1f @ branch on positive
rsbs r0,#0 @ negate mantissa
1:
subs r2,#1
cmp r2,#254 @ zero/denormal/Inf/NaN?
bhs 2f
subs r2,#126 @ remove exponent bias: can now be -126..+127
bx r14
2: @ here with special-case values
cmp r0,#0
mov r0,r4 @ set mantissa to +1
bpl 3f
rsbs r0,#0 @ zero/denormal/Inf/NaN: mantissa=+/-1
3:
subs r2,#126 @ zero/denormal: exponent -> -127; Inf, NaN: exponent -> 128
lsls r2,#2 @ zero/denormal: exponent -> -508; Inf, NaN: exponent -> 512
adds r2,#128 @ zero/denormal: exponent -> -380; Inf, NaN: exponent -> 640
bx r14
@ normalise and pack signed mantissa in r0 nominally 3Q29, signed exponent in r2-> IEEE single in r0
@ trashes r4, preserves r1,r3
@ r5: "sticky bits", must be zero iff all result bits below r0 are zero for correct rounding
packx:
lsrs r4,r0,#31 @ save sign bit
lsls r4,r4,#31 @ sign now in b31
bpl 2f @ skip if positive
cmp r5,#0
beq 11f
adds r0,#1 @ fiddle carry in to following rsb if sticky bits are non-zero
11:
rsbs r0,#0 @ can now treat r0 as unsigned
packx0:
bmi 3f @ catch r0=0x80000000 case
2:
subs r2,#1 @ normalisation loop
adds r0,r0
beq 1f @ zero? special case
bpl 2b @ normalise so leading "1" in bit 31
3:
adds r2,#129 @ (mis-)offset exponent
bne 12f @ special case: highest denormal can round to lowest normal
adds r0,#0x80 @ in special case, need to add 256 to r0 for rounding
bcs 4f @ tripped carry? then have leading 1 in C as required
12:
adds r0,#0x80 @ rounding
bcs 4f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
cmp r5,#0
beq 7f @ sticky bits zero?
8:
lsls r0,#1 @ remove leading 1
9:
subs r2,#1 @ compensate exponent on this path
4:
cmp r2,#254
bge 5f @ overflow?
adds r2,#1 @ correct exponent offset
ble 10f @ denormal/underflow?
lsrs r0,#9 @ align mantissa
lsls r2,#23 @ align exponent
orrs r0,r2 @ assemble exponent and mantissa
6:
orrs r0,r4 @ apply sign
1:
bx r14
5:
movs r0,#0xff @ create infinity
lsls r0,#23
b 6b
10:
movs r0,#0 @ create zero
bx r14
7: @ sticky bit rounding case
lsls r5,r0,#24 @ check bottom 8 bits of r0
bne 8b @ in rounding-tie case?
lsrs r0,#9 @ ensure even result
lsls r0,#10
b 9b
.align 2
.ltorg
@ signed multiply r0 1Q23 by r1 4Q23, result in r0 7Q25, sticky bits in r5
@ trashes r3,r4
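@ equivalently, in C (a model using a 64-bit intermediate; the code below
@ reaches the same result and sticky bits with 32x32->32 multiplies only):
@   int32_t mul0_model(int32_t x, int32_t y, uint32_t *sticky) {
@       int64_t p = (int64_t)x * y;    // Q23*Q23 = Q46, exact
@       *sticky = (uint32_t)p << 11;   // nonzero iff any Q46 bit below Q25 is set
@       return (int32_t)(p >> 21);     // Q46 -> Q25
@   }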
mul0:
uxth r3,r0 @ Q23
asrs r4,r1,#16 @ Q7
muls r3,r4 @ L*H, Q30 signed
asrs r4,r0,#16 @ Q7
uxth r5,r1 @ Q23
muls r4,r5 @ H*L, Q30 signed
adds r3,r4 @ sum of middle partial products
uxth r4,r0
muls r4,r5 @ L*L, Q46 unsigned
lsls r5,r4,#16 @ initialise sticky bits from low half of low partial product
lsrs r4,#16 @ Q25
adds r3,r4 @ add high half of low partial product to sum of middle partial products
@ (cannot generate carry by limits on input arguments)
asrs r0,#16 @ Q7
asrs r1,#16 @ Q7
muls r0,r1 @ H*H, Q14 signed
lsls r0,#11 @ high partial product Q25
lsls r1,r3,#27 @ sticky
orrs r5,r1 @ collect further sticky bits
asrs r1,r3,#5 @ middle partial products Q25
adds r0,r1 @ final result
bx r14
.thumb_func
qfp_fcmp:
lsls r2,r0,#1
lsrs r2,#24
beq 1f
cmp r2,#0xff
bne 2f
1:
lsrs r0,#23 @ clear mantissa if NaN or denormal
lsls r0,#23
2:
lsls r2,r1,#1
lsrs r2,#24
beq 1f
cmp r2,#0xff
bne 2f
1:
lsrs r1,#23 @ clear mantissa if NaN or denormal
lsls r1,#23
2:
movs r2,#1 @ initialise result
eors r1,r0
bmi 4f @ opposite signs? then can proceed on basis of sign of x
eors r1,r0 @ restore y
bpl 1f
rsbs r2,#0 @ both negative? flip comparison
1:
cmp r0,r1
bgt 2f
blt 3f
5:
movs r2,#0
3:
rsbs r2,#0
2:
subs r0,r2,#0
bx r14
4:
orrs r1,r0
adds r1,r1
beq 5b
cmp r0,#0
bge 2b
b 3b
@ convert float to signed int, rounding towards 0, clamping
.thumb_func
qfp_float2int_z:
push {r14}
cmp r0,#0
blt 1f
bl qfp_float2int @ +ve or zero? then use rounding towards -Inf
cmp r0,#0
bge 2f
ldr r0,=#0x7fffffff
2:
pop {r15}
1:
lsls r0,#1 @ -ve: clear sign bit
lsrs r0,#1
bl qfp_float2uint @ convert to unsigned, rounding towards -Inf
cmp r0,#0
blt 1f
rsbs r0,#0
pop {r15}
1:
movs r0,#1
lsls r0,#31 @ 0x80000000
pop {r15}
.ltorg
@ convert float to signed int, rounding towards -Inf, clamping
.thumb_func
qfp_float2int:
movs r1,#0 @ fall through
@ convert float in r0 to signed fixed point in r0, clamping
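@ e.g., assuming the usual qfplib C prototype int qfp_float2fix(float f,int e)
@ with e the number of bits after the binary point, a hypothetical call:
@   int32_t y = qfp_float2fix(1.5f, 16);   // -> 0x00018000 = 1.5*2^16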
.thumb_func
qfp_float2fix:
push {r4,r14}
bl unpackx
movs r3,r2
adds r3,#130
bmi 6f @ -0?
add r2,r1 @ incorporate binary point position into exponent
subs r2,#23 @ r2 is now amount of left shift required
blt 1f @ requires right shift?
cmp r2,#7 @ overflow?
ble 4f
3: @ overflow
asrs r1,r0,#31 @ +ve:0 -ve:0xffffffff
mvns r1,r1 @ +ve:0xffffffff -ve:0
movs r0,#1
lsls r0,#31
5:
eors r0,r1 @ +ve:0x7fffffff -ve:0x80000000 (unsigned path: 0xffffffff)
pop {r4,r15}
1:
rsbs r2,#0 @ right shift for r0, >0
cmp r2,#32
blt 2f @ more than 32 bits of right shift?
movs r2,#32
2:
asrs r0,r0,r2
pop {r4,r15}
6:
movs r0,#0
pop {r4,r15}
@ unsigned version
.thumb_func
qfp_float2uint:
movs r1,#0 @ fall through
.thumb_func
qfp_float2ufix:
push {r4,r14}
bl unpackx
add r2,r1 @ incorporate binary point position into exponent
movs r1,r0
bmi 5b @ negative? return zero
subs r2,#23 @ r2 is now amount of left shift required
blt 1b @ requires right shift?
mvns r1,r0 @ ready to return 0xffffffff
cmp r2,#8 @ overflow?
bgt 5b
4:
lsls r0,r0,r2 @ result fits, left shifted
pop {r4,r15}
@ convert uint64 to float, rounding
.thumb_func
qfp_uint642float:
movs r2,#0 @ fall through
@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
.thumb_func
qfp_ufix642float:
push {r4,r5,r14}
cmp r1,#0
bpl 3f @ positive? we can use signed code
lsls r5,r1,#31 @ contribution to sticky bits
orrs r5,r0
lsrs r0,r1,#1
subs r2,#1
b 4f
@ convert int64 to float, rounding
.thumb_func
qfp_int642float:
movs r2,#0 @ fall through
@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
.thumb_func
qfp_fix642float:
push {r4,r5,r14}
3:
movs r5,r0
orrs r5,r1
beq ret_pop45 @ zero? return +0
asrs r5,r1,#31 @ sign bits
2:
asrs r4,r1,#24 @ try shifting 7 bits at a time
cmp r4,r5
bne 1f @ next shift will overflow?
lsls r1,#7
lsrs r4,r0,#25
orrs r1,r4
lsls r0,#7
adds r2,#7
b 2b
1:
movs r5,r0
movs r0,r1
4:
rsbs r2,#0
adds r2,#32+29
b packret
@ convert signed int to float, rounding
.thumb_func
qfp_int2float:
movs r1,#0 @ fall through
@ convert signed fix to float, rounding; number of r0 bits after point in r1
.thumb_func
qfp_fix2float:
push {r4,r5,r14}
1:
movs r2,#29
subs r2,r1 @ fix exponent
packretns: @ pack and return, sticky bits=0
movs r5,#0
packret: @ common return point: "pack and return"
bl packx
ret_pop45:
pop {r4,r5,r15}
@ unsigned version
.thumb_func
qfp_uint2float:
movs r1,#0 @ fall through
.thumb_func
qfp_ufix2float:
push {r4,r5,r14}
cmp r0,#0
bge 1b @ treat <2^31 as signed
movs r2,#30
subs r2,r1 @ fix exponent
lsls r5,r0,#31 @ one sticky bit
lsrs r0,#1
b packret
@ All the scientific functions are implemented using the CORDIC algorithm. For notation,
@ details not explained in the comments below, and a good overall survey see
@ "50 Years of CORDIC: Algorithms, Architectures, and Applications" by Meher et al.,
@ IEEE Transactions on Circuits and Systems Part I, Volume 56 Issue 9.
@ Register use:
@ r0: x
@ r1: y
@ r2: z/omega
@ r3: coefficient pointer
@ r4,r12: m
@ r5: i (shift)
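@ for orientation, one circular (m=1) rotation-mode step in C (a sketch; the
@ code below is parameterised by m, also implements vector mode, and handles
@ the repeated first angle and the linearised tail):
@   // x,y: vector; z: residual angle; atan_tab[i]=atan(2^-i), same Q format as z
@   if (z >= 0) { int32_t t = x - (y >> i);
@                 y += x >> i; x = t; z -= atan_tab[i]; }
@   else        { int32_t t = x + (y >> i);
@                 y -= x >> i; x = t; z += atan_tab[i]; }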
cordic_start: @ initialisation
movs r5,#0 @ initial shift=0
mov r12,r4
b 5f
cordic_vstep: @ one step of algorithm in vector mode
cmp r1,#0 @ check sign of y
bgt 4f
b 1f
cordic_rstep: @ one step of algorithm in rotation mode
cmp r2,#0 @ check sign of angle
bge 1f
4:
subs r1,r6 @ negative rotation: y=y-(x>>i)
rsbs r7,#0
adds r2,r4 @ accumulate angle
b 2f
1:
adds r1,r6 @ positive rotation: y=y+(x>>i)
subs r2,r4 @ accumulate angle
2:
mov r4,r12
muls r7,r4 @ apply sign from m
subs r0,r7 @ finish rotation: x=x{+/-}(y>>i)
5:
ldmia r3!,{r4} @ fetch next angle from table and bump pointer
lsrs r4,#1 @ repeated angle?
bcs 3f
adds r5,#1 @ adjust shift if not
3:
mov r6,r0
asrs r6,r5 @ x>>i
mov r7,r1
asrs r7,r5 @ y>>i
lsrs r4,#1 @ shift end flag into carry
bx r14
@ CORDIC rotation mode
cordic_rot:
push {r6,r7,r14}
bl cordic_start @ initialise
1:
bl cordic_rstep
bcc 1b @ step until table finished
asrs r6,r0,#14 @ remaining small rotations can be linearised: see IV.B of paper referenced above
asrs r7,r1,#14
asrs r2,#3
muls r6,r2 @ all remaining CORDIC steps in a multiplication
muls r7,r2
mov r4,r12
muls r7,r4
asrs r6,#12
asrs r7,#12
subs r0,r7 @ x=x{+/-}(yz>>k)
adds r1,r6 @ y=y+(xz>>k)
cordic_exit:
pop {r6,r7,r15}
@ CORDIC vector mode
cordic_vec:
push {r6,r7,r14}
bl cordic_start @ initialise
1:
bl cordic_vstep
bcc 1b @ step until table finished
4:
cmp r1,#0 @ continue as in cordic_vstep but without using table; x is not affected as y is small
bgt 2f @ check sign of y
adds r1,r6 @ positive rotation: y=y+(x>>i)
subs r2,r4 @ accumulate angle
b 3f
2:
subs r1,r6 @ negative rotation: y=y-(x>>i)
adds r2,r4 @ accumulate angle
3:
asrs r6,#1
asrs r4,#1 @ next "table entry"
bne 4b
b cordic_exit
.thumb_func
qfp_fsin: @ calculate sin and cos using CORDIC rotation method
push {r4,r5,r14}
movs r1,#24
bl qfp_float2fix @ range reduction by repeated subtraction/addition in fixed point
ldr r4,pi_q29
lsrs r4,#4 @ 2pi Q24
1:
subs r0,r4
bge 1b
1:
adds r0,r4
bmi 1b @ now in range 0..2pi
lsls r2,r0,#2 @ z Q26
lsls r5,r4,#1 @ pi Q26 (r4=pi/2 Q26)
ldr r0,=#0x136e9db4 @ initialise CORDIC x,y with scaling
movs r1,#0
1:
cmp r2,r4 @ >pi/2?
blt 2f
subs r2,r5 @ reduce range to -pi/2..pi/2
rsbs r0,#0 @ rotate vector by pi
b 1b
2:
lsls r2,#3 @ Q29
adr r3,tab_cc @ circular coefficients
movs r4,#1 @ m=1
bl cordic_rot
adds r1,#9 @ fiddle factor to make sin(0)==0
movs r2,#0 @ exponents to zero
movs r3,#0
movs r5,#0 @ no sticky bits
bl clampx
bl packx @ pack cosine
bl xchxy
bl clampx
b packretns @ pack sine
.thumb_func
qfp_fcos:
push {r14}
bl qfp_fsin
mov r0,r1 @ extract cosine result
pop {r15}
@ force r0 to lie in range [-1,1] Q29
clampx:
movs r4,#1
lsls r4,#29
cmp r0,r4
bgt 1f
rsbs r4,#0
cmp r0,r4
ble 1f
bx r14
1:
movs r0,r4
bx r14
.thumb_func
qfp_ftan:
push {r4,r5,r6,r14}
bl qfp_fsin @ sine in r0/r2, cosine in r1/r3
b fdiv_n @ sin/cos
.thumb_func
qfp_fexp:
push {r4,r5,r14}
movs r1,#24
bl qfp_float2fix @ Q24: covers entire valid input range
asrs r1,r0,#16 @ Q8
ldr r2,=#5909 @ log_2(e) Q12
muls r2,r1 @ estimate exponent of result Q20 (always an underestimate)
asrs r2,#20 @ Q0
lsls r1,r0,#6 @ Q30
ldr r0,=#0x2c5c85fe @ ln(2) Q30
muls r0,r2 @ accurate contribution of estimated exponent
subs r1,r0 @ residual to be exponentiated, guaranteed ≥0, < about 0.75 Q30
@ here
@ r1: mantissa to exponentiate, 0...~0.75 Q30
@ r2: first exponent estimate
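@ the table-driven stage below is a multiplicative normalisation: in C terms
@ (a sketch, log1p/ldexp from <math.h>; ftab_exp[i-1]=log(1+2^-i) Q30, and x
@ accumulates the exponential of whatever is subtracted from the residual r):
@   double r = mant, x = 1.0;
@   for (int i = 1; i <= 14; i++)
@       if (r >= log1p(ldexp(1.0, -i))) {
@           r -= log1p(ldexp(1.0, -i));   // take this factor out of r...
@           x += ldexp(x, -i);            // ...and fold it into x: x *= 1+2^-i
@       }
@   x += x*r;                             // finally exp(r) ~ 1+r for the tiny residual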
movs r5,#1 @ shift
adr r3,ftab_exp @ could use alternate words from dtab_exp to save space if required
movs r0,#1
lsls r0,#29 @ x=1 Q29
3:
ldmia r3!,{r4}
subs r4,r1,r4
bmi 1f
movs r1,r4 @ keep result of subtraction
movs r4,r0
lsrs r4,r5
adcs r0,r4 @ x+=x>>i with rounding
1:
adds r5,#1
cmp r5,#15
bne 3b
@ here
@ r0: exp a Q29 1..2+
@ r1: ε (residual x where x=a+ε), < 2^-14 Q30
@ r2: first exponent estimate
@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
lsrs r3,r0,#15 @ exp a Q14
muls r3,r1 @ ε exp a Q44
lsrs r3,#15 @ ε exp a Q29
adcs r0,r3 @ (1+ε) exp a Q29 with rounding
b packretns @ pack result
.thumb_func
qfp_fln:
push {r4,r5,r14}
asrs r1,r0,#23
bmi 3f @ -ve argument?
beq 3f @ 0 argument?
cmp r1,#0xff
beq 4f @ +Inf/NaN
bl unpackx
adds r2,#1
ldr r3,=#0x2c5c85fe @ ln(2) Q30
lsrs r1,r3,#14 @ ln(2) Q16
muls r1,r2 @ result estimate Q16
asrs r1,#16 @ integer contribution to result
muls r3,r2
lsls r4,r1,#30
subs r3,r4 @ fractional contribution to result Q30, signed
lsls r0,#8 @ Q31
@ here
@ r0: mantissa Q31
@ r1: integer contribution to result
@ r3: fractional contribution to result Q30, signed
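@ the loop below inverts the fexp normalisation: drive the mantissa m up
@ towards 2 by factors (1+2^-i), subtracting log(1+2^-i) from the fractional
@ result for each factor taken; in C terms (a sketch, log1p/ldexp from <math.h>):
@   for (int i = 1; i <= 14; i++) {
@       double t = m + ldexp(m, -i);               // m*(1+2^-i)
@       if (t < 2.0) { m = t; frac -= log1p(ldexp(1.0, -i)); }
@   }
@   frac += (m - 2.0)/2;                           // ln(m/2) ~ (m-2)/2 as m -> 2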
movs r5,#1 @ shift
adr r4,ftab_exp @ could use alternate words from dtab_exp to save space if required
2:
movs r2,r0
lsrs r2,r5
adcs r2,r0 @ x+(x>>i) with rounding
bcs 1f @ >=2?
movs r0,r2 @ keep result
ldr r2,[r4]
subs r3,r2
1:
adds r4,#4
adds r5,#1
cmp r5,#15
bne 2b
@ here
@ r0: residual x, nearly 2 Q31
@ r1: integer contribution to result
@ r3: fractional part of result Q30
asrs r0,#2
adds r0,r3,r0
cmp r1,#0
bne 2f
asrs r0,#1
lsls r1,#29
adds r0,r1
movs r2,#0
b packretns
2:
lsls r1,#24
asrs r0,#6 @ Q24
adcs r0,r1 @ with rounding
movs r2,#5
b packretns
3:
ldr r0,=#0xff800000 @ -Inf
pop {r4,r5,r15}
4:
ldr r0,=#0x7f800000 @ +Inf
pop {r4,r5,r15}
.align 2
ftab_exp:
.word 0x19f323ed @ log 1+2^-1 Q30
.word 0x0e47fbe4 @ log 1+2^-2 Q30
.word 0x0789c1dc @ log 1+2^-3 Q30
.word 0x03e14618 @ log 1+2^-4 Q30
.word 0x01f829b1 @ log 1+2^-5 Q30
.word 0x00fe0546 @ log 1+2^-6 Q30
.word 0x007f80aa @ log 1+2^-7 Q30
.word 0x003fe015 @ log 1+2^-8 Q30
.word 0x001ff803 @ log 1+2^-9 Q30
.word 0x000ffe00 @ log 1+2^-10 Q30
.word 0x0007ff80 @ log 1+2^-11 Q30
.word 0x0003ffe0 @ log 1+2^-12 Q30
.word 0x0001fff8 @ log 1+2^-13 Q30
.word 0x0000fffe @ log 1+2^-14 Q30
.thumb_func
qfp_fatan2:
push {r4,r5,r14}
@ unpack arguments and shift one down to have common exponent
bl unpackx
bl xchxy
bl unpackx
lsls r0,r0,#5 @ Q28
lsls r1,r1,#5 @ Q28
adds r4,r2,r3 @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
asrs r4,#9
adds r4,#1
bmi 2f @ force y to 0 proper, so result will be zero
subs r4,r2,r3 @ calculate shift
bge 1f @ ex>=ey?
rsbs r4,#0 @ make shift positive
asrs r0,r4
cmp r4,#28
blo 3f
asrs r0,#31
b 3f
1:
asrs r1,r4
cmp r4,#28
blo 3f
2:
@ here |x|>>|y| or both x and y are ±0
cmp r0,#0
bge 4f @ x positive, return signed 0
ldr r0,pi_q29 @ x negative, return +/- pi
asrs r1,#31
eors r0,r1
b 7f
4:
asrs r0,r1,#31
b 7f
3:
movs r2,#0 @ initial angle
cmp r0,#0 @ x negative
bge 5f
rsbs r0,#0 @ rotate to 1st/4th quadrants
rsbs r1,#0
ldr r2,pi_q29 @ pi Q29
5:
adr r3,tab_cc @ circular coefficients
movs r4,#1 @ m=1
bl cordic_vec @ also produces magnitude (with scaling factor 1.646760119), which is discarded
mov r0,r2 @ result here is -pi/2..3pi/2 Q29
@ asrs r2,#29
@ subs r0,r2
ldr r2,pi_q29 @ pi Q29
adds r4,r0,r2 @ attempt to fix -3pi/2..-pi case
bcs 6f @ -pi/2..0? leave result as is
subs r4,r0,r2 @ <pi? leave as is
bmi 6f
subs r0,r4,r2 @ >pi: take off 2pi
6:
subs r0,#1 @ fiddle factor so atan2(0,1)==0
7:
movs r2,#0 @ exponent for pack
b packretns
.align 2
.ltorg
@ first entry in following table is pi Q29
pi_q29:
@ circular CORDIC coefficients: atan(2^-i), b0=flag for preventing shift, b1=flag for end of table
tab_cc:
.word 0x1921fb54*4+1 @ no shift before first iteration
.word 0x0ed63383*4+0
.word 0x07d6dd7e*4+0
.word 0x03fab753*4+0
.word 0x01ff55bb*4+0
.word 0x00ffeaae*4+0
.word 0x007ffd55*4+0
.word 0x003fffab*4+0
.word 0x001ffff5*4+0
.word 0x000fffff*4+0
.word 0x0007ffff*4+0
.word 0x00040000*4+0
.word 0x00020000*4+0+2 @ +2 marks end
.align 2
.thumb_func
qfp_fsub:
ldr r2,=#0x80000000
eors r1,r2 @ flip sign on second argument
@ drop into fadd, on .align2:ed boundary
.thumb_func
qfp_fadd:
push {r4,r5,r6,r14}
asrs r4,r0,#31
lsls r2,r0,#1
lsrs r2,#24 @ x exponent
beq fa_xe0
cmp r2,#255
beq fa_xe255
fa_xe:
asrs r5,r1,#31
lsls r3,r1,#1
lsrs r3,#24 @ y exponent
beq fa_ye0
cmp r3,#255
beq fa_ye255
fa_ye:
ldr r6,=#0x007fffff
ands r0,r0,r6 @ extract mantissa bits
ands r1,r1,r6
adds r6,#1 @ r6=0x00800000
orrs r0,r0,r6 @ set implied 1
orrs r1,r1,r6
eors r0,r0,r4 @ complement...
eors r1,r1,r5
subs r0,r0,r4 @ ... and add 1 if sign bit is set: 2's complement
subs r1,r1,r5
subs r5,r3,r2 @ ye-xe
subs r4,r2,r3 @ xe-ye
bmi fa_ygtx
@ here xe>=ye
cmp r4,#30
bge fa_xmgty @ xe much greater than ye?
adds r5,#32
movs r3,r2 @ save exponent
@ here y in r1 must be shifted down r4 places to align with x in r0
movs r2,r1
lsls r2,r2,r5 @ keep the bits we will shift off the bottom of r1
asrs r1,r1,r4
b fa_0
.ltorg
fa_ymgtx:
movs r2,#0 @ result is just y
movs r0,r1
b fa_1
fa_xmgty:
movs r3,r2 @ result is just x
movs r2,#0
b fa_1
fa_ygtx:
@ here ye>xe
cmp r5,#30
bge fa_ymgtx @ ye much greater than xe?
adds r4,#32
@ here x in r0 must be shifted down r5 places to align with y in r1
movs r2,r0
lsls r2,r2,r4 @ keep the bits we will shift off the bottom of r0
asrs r0,r0,r5
fa_0:
adds r0,r1 @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3
beq fa_9 @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0
fa_1:
lsrs r1,r0,#31 @ sign bit
beq fa_8
mvns r0,r0
rsbs r2,r2,#0
bne fa_8
adds r0,#1
fa_8:
adds r6,r6
@ r6=0x01000000
cmp r0,r6
bhs fa_2
fa_3:
adds r2,r2 @ normalisation loop
adcs r0,r0
subs r3,#1 @ adjust exponent
cmp r0,r6
blo fa_3
fa_2:
@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit
lsrs r0,#1
bcc fa_4
@ rounding bits here are 1:r2
adds r0,#1 @ round up
cmp r2,#0
beq fa_5 @ sticky bits all zero?
fa_4:
cmp r3,#254
bhs fa_6 @ exponent too large or negative?
lsls r1,#31 @ pack everything
add r0,r1
lsls r3,#23
add r0,r3
fa_end:
pop {r4,r5,r6,r15}
fa_9:
cmp r2,#0 @ result zero?
beq fa_end @ return +0
b fa_1
fa_5:
lsrs r0,#1
lsls r0,#1 @ round to even
b fa_4
fa_6:
bge fa_7
@ underflow
@ can handle denormals here
lsls r0,r1,#31 @ result is signed zero
pop {r4,r5,r6,r15}
fa_7:
@ overflow
lsls r0,r1,#8
adds r0,#255
lsls r0,#23 @ result is signed infinity
pop {r4,r5,r6,r15}
fa_xe0:
@ can handle denormals here
subs r2,#32
adds r2,r4 @ exponent -32 for +Inf, -33 for -Inf
b fa_xe
fa_xe255:
@ can handle NaNs here
lsls r2,#8
add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
b fa_xe
fa_ye0:
@ can handle denormals here
subs r3,#32
adds r3,r5 @ exponent -32 for +Inf, -33 for -Inf
b fa_ye
fa_ye255:
@ can handle NaNs here
lsls r3,#8
add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
b fa_ye
.align 2
.thumb_func
qfp_fmul:
push {r7,r14}
mov r2,r0
eors r2,r1 @ sign of result
lsrs r2,#31
lsls r2,#31
mov r14,r2
lsls r0,#1
lsls r1,#1
lsrs r2,r0,#24 @ xe
beq fm_xe0
cmp r2,#255
beq fm_xe255
fm_xe:
lsrs r3,r1,#24 @ ye
beq fm_ye0
cmp r3,#255
beq fm_ye255
fm_ye:
adds r7,r2,r3 @ exponent of result (will possibly be incremented)
subs r7,#128 @ adjust bias for packing
lsls r0,#8 @ x mantissa
lsls r1,#8 @ y mantissa
lsrs r0,#9
lsrs r1,#9
adds r2,r0,r1 @ for later
mov r12,r2
lsrs r2,r0,#7 @ x[22..7] Q16
lsrs r3,r1,#7 @ y[22..7] Q16
muls r2,r2,r3 @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
muls r0,r0,r1 @ result [31..0] Q46
lsrs r2,#18 @ result [45..32] Q14
bcc 1f
cmp r0,#0
bmi 1f
adds r2,#1 @ fix error in r2
1:
lsls r3,r0,#9 @ bits off bottom of result
lsrs r0,#23 @ Q23
lsls r2,#9
adds r0,r2 @ cut'n'shut
add r0,r12 @ implied 1*(x+y) to compensate for no insertion of implied 1s
@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
lsrs r1,r0,#23
bne fm_0 @ branch if we need to shift down one place
@ here 1<=result<2
cmp r7,#254
bhs fm_3a @ catches both underflow and overflow
lsls r3,#1 @ sticky bits at top of R3, rounding bit in carry
bcc fm_1 @ no rounding
beq fm_2 @ rounding tie?
adds r0,#1 @ round up
fm_1:
adds r7,#1 @ for implied 1
lsls r7,#23 @ pack result
add r0,r7
add r0,r14
pop {r7,r15}
fm_2: @ rounding tie
adds r0,#1
fm_3:
lsrs r0,#1
lsls r0,#1 @ clear bottom bit
b fm_1
@ here 1<=result-1<3
fm_0:
adds r7,#1 @ increment exponent
cmp r7,#254
bhs fm_3b @ catches both underflow and overflow
lsrs r0,#1 @ shift mantissa down
bcc fm_1a @ no rounding
adds r0,#1 @ assume we will round up
cmp r3,#0 @ sticky bits
beq fm_3c @ rounding tie?
fm_1a:
adds r7,r7
adds r7,#1 @ for implied 1
lsls r7,#22 @ pack result
add r0,r7
add r0,r14
pop {r7,r15}
fm_3c:
lsrs r0,#1
lsls r0,#1 @ clear bottom bit
b fm_1a
fm_xe0:
subs r2,#16
fm_xe255:
lsls r2,#8
b fm_xe
fm_ye0:
subs r3,#16
fm_ye255:
lsls r3,#8
b fm_ye
@ here the result is under- or overflowing
fm_3b:
bge fm_4 @ branch on overflow
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
adds r7,#1 @ exponent=-1?
bne fm_5
@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
adds r0,#2
lsrs r0,#23
cmp r0,#3
bne fm_5
b fm_6
fm_3a:
bge fm_4 @ branch on overflow
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
adds r7,#1 @ exponent=-1?
bne fm_5
adds r0,#1 @ mantissa=0xffffff (i.e., r0=0x7fffff)?
lsrs r0,#23
beq fm_5
fm_6:
movs r0,#1 @ return smallest normal
lsls r0,#23
add r0,r14
pop {r7,r15}
fm_5:
mov r0,r14
pop {r7,r15}
fm_4:
movs r0,#0xff
lsls r0,#23
add r0,r14
pop {r7,r15}
@ This version of the division algorithm uses a small reciprocal lookup table
@ plus one polynomial refinement step to estimate the reciprocal of the
@ divisor to about 14 bits; then a multiplication step to get a first
@ quotient estimate; then the remainder based on this estimate is used to calculate a
@ correction to the quotient. The result is good to about 27 bits and so we only need
@ to calculate the exact remainder when close to a rounding boundary.
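@ in outline, writing y for the dividend and x for the divisor mantissas
@ (a sketch; the divtest.c fragments quoted below give the exact
@ fixed-point details):
@   u  ~ 1/x;                // ~14-bit reciprocal estimate (rcpapp + refinement)
@   q0 = y*u;                // first quotient estimate
@   r0 = y - q0*x;           // exact (signed) remainder of that estimate
@   q  = q0 + r0*u;          // quotient now good to ~27 bits
@   // only when q lies near a rounding boundary is the exact remainder
@   // y - q*x formed to settle the final bit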
.align 2
.thumb_func
qfp_fdiv:
push {r4,r5,r6,r14}
fdiv_n:
movs r4,#1
lsls r4,#23 @ implied 1 position
lsls r2,r1,#9 @ clear out sign and exponent
lsrs r2,r2,#9
orrs r2,r2,r4 @ divisor mantissa Q23 with implied 1
@ here
@ r0=packed dividend
@ r1=packed divisor
@ r2=divisor mantissa Q23
@ r4=1<<23
// see divtest.c
lsrs r3,r2,#18 @ x2=x>>18; // Q5 32..63
adr r5,rcpapp-32
ldrb r3,[r5,r3] @ u=lut5[x2-32]; // Q8
lsls r5,r2,#5
muls r5,r5,r3
asrs r5,#14 @ e=(i32)(u*(x<<5))>>14; // Q22
asrs r6,r5,#11
muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
subs r5,r6
muls r5,r5,r3 @ c=(e-e2)*u; // Q30
lsls r6,r3,#8
asrs r5,#13
adds r5,#1
asrs r5,#1
subs r5,r6,r5 @ u0=(u<<8)-((c+0x2000)>>14); // Q16
@ here
@ r0=packed dividend
@ r1=packed divisor
@ r2=divisor mantissa Q23
@ r4=1<<23
@ r5=reciprocal estimate Q16
lsrs r6,r0,#23
uxtb r3,r6 @ dividend exponent
lsls r0,#9
lsrs r0,#9
orrs r0,r0,r4 @ dividend mantissa Q23
lsrs r1,#23
eors r6,r1 @ sign of result in bit 8
lsrs r6,#8
lsls r6,#31 @ sign of result in bit 31, other bits clear
@ here
@ r0=dividend mantissa Q23
@ r1=divisor sign+exponent
@ r2=divisor mantissa Q23
@ r3=dividend exponent
@ r5=reciprocal estimate Q16
@ r6b31=sign of result
uxtb r1,r1 @ divisor exponent
cmp r1,#0
beq retinf
cmp r1,#255
beq 20f @ divisor is infinite
cmp r3,#0
beq retzero
cmp r3,#255
beq retinf
subs r3,r1 @ initial result exponent (no bias)
adds r3,#125 @ add bias
lsrs r1,r0,#8 @ dividend mantissa Q15
@ here
@ r0=dividend mantissa Q23
@ r1=dividend mantissa Q15
@ r2=divisor mantissa Q23
@ r3=initial result exponent
@ r5=reciprocal estimate Q16
@ r6b31=sign of result
muls r1,r5
lsrs r1,#16 @ Q15 qu0=(q15)(u*y0);
lsls r0,r0,#15 @ dividend Q38
movs r4,r2
muls r4,r1 @ Q38 qu0*x
subs r4,r0,r4 @ Q38 re0=(y<<15)-qu0*x; note this remainder is signed
asrs r4,#10
muls r4,r5 @ Q44 qu1=(re0>>10)*u; this quotient correction is also signed
asrs r4,#16 @ Q28
lsls r1,#13
adds r1,r1,r4 @ Q28 qu=(qu0<<13)+(qu1>>16);
@ here
@ r0=dividend mantissa Q38
@ r1=quotient Q28
@ r2=divisor mantissa Q23
@ r3=initial result exponent
@ r6b31=sign of result
lsrs r4,r1,#28
bne 1f
@ here the quotient is less than 1<<28 (i.e., result mantissa <1.0)
adds r1,#5
lsrs r4,r1,#4 @ rounding + small reduction in systematic bias
bcc 2f @ skip if we are not near a rounding boundary
lsrs r1,#3 @ quotient Q25
lsls r0,#10 @ dividend mantissa Q48
muls r1,r1,r2 @ quotient*divisor Q48
subs r0,r0,r1 @ remainder Q48
bmi 2f
b 3f
1:
@ here the quotient is at least 1<<28 (i.e., result mantissa >=1.0)
adds r3,#1 @ bump exponent (and shift mantissa down one more place)
adds r1,#9
lsrs r4,r1,#5 @ rounding + small reduction in systematic bias
bcc 2f @ skip if we are not near a rounding boundary
lsrs r1,#4 @ quotient Q24
lsls r0,#9 @ dividend mantissa Q47
muls r1,r1,r2 @ quotient*divisor Q47
subs r0,r0,r1 @ remainder Q47
bmi 2f
3:
adds r4,#1 @ increment quotient as we are above the rounding boundary
@ here
@ r3=result exponent
@ r4=correctly rounded quotient Q23 in range [1,2] *note closed interval*
@ r6b31=sign of result
2:
cmp r3,#254
bhs 10f @ this catches both underflow and overflow
lsls r1,r3,#23
adds r0,r4,r1
adds r0,r6
pop {r4,r5,r6,r15}
@ here divisor is infinite; dividend exponent in r3
20:
cmp r3,#255
bne retzero
retinf:
movs r0,#255
21:
lsls r0,#23
orrs r0,r6
pop {r4,r5,r6,r15}
10:
bge retinf @ overflow?
adds r1,r3,#1
bne retzero @ exponent <-1? return 0
@ here exponent is exactly -1
lsrs r1,r4,#25
bcc retzero @ mantissa is not 01000000?
@ return minimum normal
movs r0,#1
lsls r0,#23
orrs r0,r6
pop {r4,r5,r6,r15}
retzero:
movs r0,r6
pop {r4,r5,r6,r15}
@ x2=[32:1:63]/32;
@ round(256 ./(x2+1/64))
.align 2
rcpapp:
.byte 252,245,237,231,224,218,213,207,202,197,193,188,184,180,176,172
.byte 169,165,162,159,156,153,150,148,145,142,140,138,135,133,131,129
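@ the table can be regenerated with, e.g.:
@   #include <stdio.h>
@   #include <math.h>
@   int main(void) {
@       for (int i = 32; i < 64; i++)   // x2 = i/32, Q5 in [1,2)
@           printf("%d%c", (int)lround(256.0/(i/32.0 + 1.0/64)),
@                  i == 63 ? '\n' : ',');
@       return 0;
@   }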
@ The square root routine uses an initial approximation to the reciprocal of the square root of the argument based
@ on the top four bits of the mantissa (possibly shifted one place to make the exponent even). It then performs two
@ Newton-Raphson iterations, resulting in about 14 bits of accuracy. This reciprocal is then multiplied by
@ the original argument to produce an approximation to the result, again with about 14 bits of accuracy.
@ Then a remainder is calculated, and multiplied by the reciprocal estimate to generate a correction term
@ giving a final answer to about 28 bits of accuracy. A final remainder calculation rounds to the correct
@ result if necessary.
@ Again, the fixed-point calculation is carefully implemented to preserve accuracy, and similar comments to those
@ made above on the fast division routine apply.
@ The reciprocal square root calculation has been tested for all possible (possibly shifted) input mantissa values.
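@ the two iterations below are Newton-Raphson on f(a)=1/a^2-y, giving
@ a' = a*(3-y*a*a)/2; in floating-point terms (a sketch):
@   double a = a0;                  // table estimate of 1/sqrt(y)
@   for (int k = 0; k < 2; k++) {
@       double r = y*a*a - 1.0;     // residual
@       a -= a*r/2;                 // error roughly squares on each pass
@   }
@   double s = a*y;                 // ~14-bit estimate of sqrt(y)
@   s += (y - s*s)*a/2;             // ~28 bits, using d sqrt(y)/dy = 1/(2 sqrt(y))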
.align 2
.thumb_func
qfp_fsqrt:
push {r4}
lsls r1,r0,#1
bcs sq_0 @ negative?
lsls r1,#8
lsrs r1,#9 @ mantissa
movs r2,#1
lsls r2,#23
adds r1,r2 @ insert implied 1
lsrs r2,r0,#23 @ extract exponent
beq sq_2 @ zero?
cmp r2,#255 @ infinite?
beq sq_1
adds r2,#125 @ correction for packing
asrs r2,#1 @ exponent/2, LSB into carry
bcc 1f
lsls r1,#1 @ was even: double mantissa; mantissa y now 1..4 Q23
1:
adr r4,rsqrtapp-4 @ first four table entries are never accessed because of the mantissa's leading 1
lsrs r3,r1,#21 @ y Q2
ldrb r4,[r4,r3] @ initial approximation to reciprocal square root a0 Q8
lsrs r0,r1,#7 @ y Q16: first Newton-Raphson iteration
muls r0,r4 @ a0*y Q24
muls r0,r4 @ r0=p0=a0*y*y Q32
asrs r0,#12 @ r0 Q20
muls r0,r4 @ dy0=a0*r0 Q28
asrs r0,#13 @ dy0 Q15
lsls r4,#8 @ a0 Q16
subs r4,r0 @ a1=a0-dy0/2 Q16-Q15/2 -> Q16
adds r4,#170 @ mostly remove systematic error in this approximation: gains approximately 1 bit
movs r0,r4 @ second Newton-Raphson iteration
muls r0,r0 @ a1*a1 Q32
lsrs r0,#15 @ a1*a1 Q17
lsrs r3,r1,#8 @ y Q15
muls r0,r3 @ r1=p1=a1*a1*y Q32
asrs r0,#12 @ r1 Q20
muls r0,r4 @ dy1=a1*r1 Q36
asrs r0,#21 @ dy1 Q15
subs r4,r0 @ a2=a1-dy1/2 Q16-Q15/2 -> Q16
muls r3,r4 @ a3=y*a2 Q31
lsrs r3,#15 @ a3 Q16
@ here a2 is an approximation to the reciprocal square root
@ and a3 is an approximation to the square root
movs r0,r3
muls r0,r0 @ a3*a3 Q32
lsls r1,#9 @ y Q32
subs r0,r1,r0 @ r2=y-a3*a3 Q32 remainder
asrs r0,#5 @ r2 Q27
muls r4,r0 @ r2*a2 Q43
lsls r3,#7 @ a3 Q23
asrs r0,r4,#15 @ r2*a2 Q28
adds r0,#16 @ rounding to Q24
asrs r0,r0,#6 @ r2*a2 Q22
add r3,r0 @ a4 Q23: candidate final result
bcc sq_3 @ near rounding boundary? skip if no rounding needed
mov r4,r3
adcs r4,r4 @ a4+0.5ulp Q24
muls r4,r4 @ Q48
lsls r1,#16 @ y Q48
subs r1,r4 @ remainder Q48
bmi sq_3
adds r3,#1 @ round up
sq_3:
lsls r2,#23 @ pack exponent
adds r0,r2,r3
sq_6:
pop {r4}
bx r14
sq_0:
lsrs r1,#24
beq sq_2 @ -0: return it
@ here negative and not -0: return -Inf
asrs r0,#31
sq_5:
lsls r0,#23
b sq_6
sq_1: @ +Inf
lsrs r0,#23
b sq_5
sq_2:
lsrs r0,#31
lsls r0,#31
b sq_6
@ round(sqrt(2^22./[72:16:248]))
rsqrtapp:
.byte 0xf1,0xda,0xc9,0xbb, 0xb0,0xa6,0x9e,0x97, 0x91,0x8b,0x86,0x82
@ Notation:
@ rx:ry means the concatenation of rx and ry with rx having the less significant bits
@ IEEE double in ra:rb ->
@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set
@ exponent in re
@ sign in rs
@ trashes rt
.macro mdunpack ra,rb,re,rs,rt
lsrs \re,\rb,#20 @ extract sign and exponent
subs \rs,\re,#1
lsls \rs,#20
subs \rb,\rs @ clear sign and exponent in mantissa; insert implied 1
lsrs \rs,\re,#11 @ sign
lsls \re,#21
lsrs \re,#21 @ exponent
beq l\@_1 @ zero exponent?
adds \rt,\re,#1
lsrs \rt,#11
beq l\@_2 @ exponent != 0x7ff? then done
l\@_1:
movs \ra,#0
movs \rb,#1
lsls \rb,#20
subs \re,#128
lsls \re,#12
l\@_2:
.endm
@ IEEE double in ra:rb ->
@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1
@ exponent in re
@ trashes rt0 and rt1
@ +zero, +denormal -> exponent=-0x80000
@ -zero, -denormal -> exponent=-0x80000
@ +Inf, +NaN -> exponent=+0x77f000
@ -Inf, -NaN -> exponent=+0x77e000
.macro mdunpacks ra,rb,re,rt0,rt1
lsrs \re,\rb,#20 @ extract sign and exponent
lsrs \rt1,\rb,#31 @ sign only
subs \rt0,\re,#1
lsls \rt0,#20
subs \rb,\rt0 @ clear sign and exponent in mantissa; insert implied 1
lsls \re,#21
bcc l\@_1 @ skip on positive
mvns \rb,\rb @ negate mantissa
rsbs \ra,#0
bcc l\@_1
adds \rb,#1
l\@_1:
lsrs \re,#21
beq l\@_2 @ zero exponent?
adds \rt0,\re,#1
lsrs \rt0,#11
beq l\@_3 @ exponent != 0x7ff? then done
subs \re,\rt1
l\@_2:
movs \ra,#0
lsls \rt1,#1 @ +ve: 0 -ve: 2
adds \rb,\rt1,#1 @ +ve: 1 -ve: 3
lsls \rb,#30 @ create +/-1 mantissa
asrs \rb,#10
subs \re,#128
lsls \re,#12
l\@_3:
.endm
.align 2
.thumb_func
qfp_dsub:
push {r4-r7,r14}
movs r4,#1
lsls r4,#31
eors r3,r4 @ flip sign on second argument
b da_entry @ continue in dadd
.align 2
.thumb_func
qfp_dadd:
push {r4-r7,r14}
da_entry:
mdunpacks r0,r1,r4,r6,r7
mdunpacks r2,r3,r5,r6,r7
subs r7,r5,r4 @ ye-xe
subs r6,r4,r5 @ xe-ye
bmi da_ygtx
@ here xe>=ye: need to shift y down r6 places
mov r12,r4 @ save exponent
cmp r6,#32
bge da_xrgty @ xe rather greater than ye?
adds r7,#32
movs r4,r2
lsls r4,r4,r7 @ rounding bit + sticky bits
da_xgty0:
movs r5,r3
lsls r5,r5,r7
lsrs r2,r6
asrs r3,r6
orrs r2,r5
da_add:
adds r0,r2
adcs r1,r3
da_pack:
@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4
@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we
@ cannot have gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4
@ to provide a correct continuation of the exact result.
@ now pack result back up
lsrs r3,r1,#31 @ get sign bit
beq 1f @ skip on positive
mvns r1,r1 @ negate mantissa
mvns r0,r0
movs r2,#0
rsbs r4,#0
adcs r0,r2
adcs r1,r2
1:
mov r2,r12 @ get exponent
lsrs r5,r1,#21
bne da_0 @ shift down required?
lsrs r5,r1,#20
bne da_1 @ normalised?
cmp r0,#0
beq da_5 @ could mantissa be zero?
da_2:
adds r4,r4
adcs r0,r0
adcs r1,r1
subs r2,#1 @ adjust exponent
lsrs r5,r1,#20
beq da_2
da_1:
lsls r4,#1 @ check rounding bit
bcc da_3
da_4:
adds r0,#1 @ round up
bcc 2f
adds r1,#1
2:
cmp r4,#0 @ sticky bits zero?
bne da_3
lsrs r0,#1 @ round to even
lsls r0,#1
da_3:
subs r2,#1
bmi da_6
adds r4,r2,#2 @ check if exponent is overflowing
lsrs r4,#11
bne da_7
lsls r2,#20 @ pack exponent and sign
add r1,r2
lsls r3,#31
add r1,r3
pop {r4-r7,r15}
da_7:
@ here exponent overflow: return signed infinity
lsls r1,r3,#31
ldr r3,=#0x7ff00000
orrs r1,r3
b 1f
da_6:
@ here exponent underflow: return signed zero
lsls r1,r3,#31
1:
movs r0,#0
pop {r4-r7,r15}
da_5:
@ here mantissa could be zero
cmp r1,#0
bne da_2
cmp r4,#0
bne da_2
@ inputs must have been of identical magnitude and opposite sign, so return +0
pop {r4-r7,r15}
da_0:
@ here a shift down by one place is required for normalisation
adds r2,#1 @ adjust exponent
lsls r6,r0,#31 @ save rounding bit
lsrs r0,#1
lsls r5,r1,#31
orrs r0,r5
lsrs r1,#1
cmp r6,#0
beq da_3
b da_4
da_xrgty: @ xe>ye and shift>=32 places
cmp r6,#60
bge da_xmgty @ xe much greater than ye?
subs r6,#32
adds r7,#64
movs r4,r2
lsls r4,r4,r7 @ these would be shifted off the bottom of the sticky bits
beq 1f
movs r4,#1
1:
lsrs r2,r2,r6
orrs r4,r2
movs r2,r3
lsls r3,r3,r7
orrs r4,r3
asrs r3,r2,#31 @ propagate sign bit
b da_xgty0
da_ygtx:
@ here ye>xe: need to shift x down r7 places
mov r12,r5 @ save exponent
cmp r7,#32
bge da_yrgtx @ ye rather greater than xe?
adds r6,#32
movs r4,r0
lsls r4,r4,r6 @ rounding bit + sticky bits
da_ygtx0:
movs r5,r1
lsls r5,r5,r6
lsrs r0,r7
asrs r1,r7
orrs r0,r5
b da_add
da_yrgtx:
cmp r7,#60
bge da_ymgtx @ ye much greater than xe?
subs r7,#32
adds r6,#64
movs r4,r0
lsls r4,r4,r6 @ these would be shifted off the bottom of the sticky bits
beq 1f
movs r4,#1
1:
lsrs r0,r0,r7
orrs r4,r0
movs r0,r1
lsls r1,r1,r6
orrs r4,r1
asrs r1,r0,#31 @ propagate sign bit
b da_ygtx0
da_ymgtx: @ result is just y
movs r0,r2
movs r1,r3
da_xmgty: @ result is just x
movs r4,#0 @ clear sticky bits
b da_pack
.ltorg
@ equivalent of UMULL
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
@ t0 t1 t2 t3 t4
@ (x) (y)
uxth \rt0,\rx @ xl
uxth \rt1,\ry @ yl
muls \rt0,\rt1 @ xlyl=L
lsrs \rt2,\rx,#16 @ xh
muls \rt1,\rt2 @ xhyl=M0
lsrs \rt4,\ry,#16 @ yh
muls \rt2,\rt4 @ xhyh=H
uxth \rt3,\rx @ xl
muls \rt3,\rt4 @ xlyh=M1
adds \rt1,\rt3 @ M0+M1=M
bcc l\@_1 @ addition of the two cross terms can overflow, so add carry into H
movs \rt3,#1 @ 1
lsls \rt3,#16 @ 0x10000
adds \rt2,\rt3 @ H'
l\@_1:
@ t0 t1 t2 t3 t4
@ (zl) (zh)
lsls \rzl,\rt1,#16 @ ML
lsrs \rzh,\rt1,#16 @ MH
adds \rzl,\rt0 @ ZL
adcs \rzh,\rt2 @ ZH
.endm
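@ in C, the macro above computes (a model; the M0+ lacks UMULL, hence the
@ four 16x16 partial products and the explicit cross-term carry):
@   #include <stdint.h>
@   static uint64_t mul32_32_64_model(uint32_t x, uint32_t y) {
@       uint32_t xl = x & 0xffff, xh = x >> 16;
@       uint32_t yl = y & 0xffff, yh = y >> 16;
@       uint32_t l = xl*yl, h = xh*yh, m0 = xh*yl, m = m0 + xl*yh;
@       if (m < m0) h += 0x10000;       // carry out of the cross-term sum
@       return ((uint64_t)h << 32) + ((uint64_t)m << 16) + l;
@   }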
@ SUMULL: x signed, y unsigned
@ in table below ¯ means signed variable
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
@ t0 t1 t2 t3 t4
@ ¯(x) (y)
uxth \rt0,\rx @ xl
uxth \rt1,\ry @ yl
muls \rt0,\rt1 @ xlyl=L
asrs \rt2,\rx,#16 @ ¯xh
muls \rt1,\rt2 @ ¯xhyl=M0
lsrs \rt4,\ry,#16 @ yh
muls \rt2,\rt4 @ ¯xhyh=H
uxth \rt3,\rx @ xl
muls \rt3,\rt4 @ xlyh=M1
asrs \rt4,\rt1,#31 @ M0sx (M1 sign extension is zero)
adds \rt1,\rt3 @ M0+M1=M
movs \rt3,#0 @ 0
adcs \rt4,\rt3 @ ¯Msx
lsls \rt4,#16 @ ¯Msx<<16
adds \rt2,\rt4 @ H'
@ t0 t1 t2 t3 t4
@ (zl) (zh)
lsls \rzl,\rt1,#16 @ M~
lsrs \rzh,\rt1,#16 @ M~
adds \rzl,\rt0 @ ZL
adcs \rzh,\rt2 @ ¯ZH
.endm
@ SSMULL: x signed, y signed
@ in table below ¯ means signed variable
@ needs five temporary registers
@ can have rt3==rx, in which case rx trashed
@ can have rt4==ry, in which case ry trashed
@ can have rzl==rx
@ can have rzh==ry
@ can have rzl,rzh==rt3,rt4
.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
@ t0 t1 t2 t3 t4
@ ¯(x) (y)
uxth \rt0,\rx @ xl
uxth \rt1,\ry @ yl
muls \rt0,\rt1 @ xlyl=L
asrs \rt2,\rx,#16 @ ¯xh
muls \rt1,\rt2 @ ¯xhyl=M0
asrs \rt4,\ry,#16 @ ¯yh
muls \rt2,\rt4 @ ¯xhyh=H
uxth \rt3,\rx @ xl
muls \rt3,\rt4 @ ¯xlyh=M1
adds \rt1,\rt3 @ ¯M0+M1=M
asrs \rt3,\rt1,#31 @ Msx
bvc l\@_1 @
mvns \rt3,\rt3 @ ¯Msx flip sign extension bits if overflow
l\@_1:
lsls \rt3,#16 @ ¯Msx<<16
adds \rt2,\rt3 @ H'
@ t0 t1 t2 t3 t4
@ (zl) (zh)
lsls \rzl,\rt1,#16 @ M~
lsrs \rzh,\rt1,#16 @ M~
adds \rzl,\rt0 @ ZL
adcs \rzh,\rt2 @ ¯ZH
.endm
@ can have rt2==rx, in which case rx trashed
@ can have rzl==rx
@ can have rzh==rt1
.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2
@ t0 t1 t2 zl zh
uxth \rt0,\rx @ xl
muls \rt0,\rt0 @ xlxl=L
uxth \rt1,\rx @ xl
lsrs \rt2,\rx,#16 @ xh
muls \rt1,\rt2 @ xlxh=M
muls \rt2,\rt2 @ xhxh=H
lsls \rzl,\rt1,#17 @ ML
lsrs \rzh,\rt1,#15 @ MH
adds \rzl,\rt0 @ ZL
adcs \rzh,\rt2 @ ZH
.endm
.align 2
.thumb_func
qfp_dmul:
push {r4-r7,r14}
mdunpack r0,r1,r4,r6,r5
mov r12,r4
mdunpack r2,r3,r4,r7,r5
eors r7,r6 @ sign of result
add r4,r12 @ exponent of result
push {r0-r2,r4,r7}
@ accumulate full product in r12:r5:r6:r7
mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5 @ XL*YL
mov r12,r0 @ save LL bits
mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7 @ XH*YH
pop {r0} @ XL
mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3 @ XL*YH
adds r5,r0
adcs r6,r3
movs r0,#0
adcs r7,r0
pop {r1,r2} @ XH,YL
mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2 @ XH*YL
adds r5,r1
adcs r6,r2
movs r0,#0
adcs r7,r0
@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12
pop {r3,r4} @ exponent in r3, sign in r4
lsls r1,r7,#11
lsrs r2,r6,#21
orrs r1,r2
lsls r0,r6,#11
lsrs r2,r5,#21
orrs r0,r2
lsls r5,#11 @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12
lsrs r2,r1,#20
bne 1f @ skip if in range [2..4)
adds r5,r5 @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32)
adcs r0,r0
adcs r1,r1
subs r3,#1 @ correct exponent
1:
ldr r6,=#0x3ff
subs r3,r6 @ correct for exponent bias
lsls r6,#1 @ 0x7fe
cmp r3,r6
bhs dm_0 @ exponent over- or underflow
lsls r5,#1 @ rounding bit to carry
bcc 1f @ result is correctly rounded
adds r0,#1
movs r6,#0
adcs r1,r6 @ round up
mov r6,r12 @ remaining sticky bits
orrs r5,r6
bne 1f @ some sticky bits set?
lsrs r0,#1
lsls r0,#1 @ round to even
1:
lsls r3,#20
adds r1,r3
dm_2:
lsls r4,#31
add r1,r4
pop {r4-r7,r15}
@ here for exponent over- or underflow
dm_0:
bge dm_1 @ overflow?
adds r3,#1 @ would-be zero exponent?
bne 1f
adds r0,#1
bne 1f @ all-ones mantissa?
adds r1,#1
lsrs r7,r1,#21
beq 1f
lsrs r1,#1
b dm_2
1:
lsls r1,r4,#31
movs r0,#0
pop {r4-r7,r15}
@ here for exponent overflow
dm_1:
adds r6,#1 @ 0x7ff
lsls r1,r6,#20
movs r0,#0
b dm_2
.ltorg
@ Approach to division y/x is as follows.
@
@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top
@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so).
@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction
@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact
@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to
@ round up or down.
@
@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively
@ that | u1*x-1 | < 10*2^-32.
@
@ More precisely:
@
@ x0=(q16)x;
@ x1=(q30)x;
@ y0=(q31)y;
@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation
@ v=(q30)(u0*x1-1);
@ u1=(q30)u0-(q30~)(u0*v);
@
@ a0=(q30)(u1*y0);
@ r0=(q82)y-a0*x;
@ r0x=(q57)r0;
@ d0=r0x*u1;
@ a1=d0+a0;
@
@ Error analysis
@
@ Use Greek letters to represent the errors introduced by rounding and truncation.
@
@ r₀ = y - a₀x
@ = y - [ u₁ ( y - α ) - β ] x where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30
@ = y ( 1 - u₁x ) + ( u₁α + β ) x
@
@ Hence
@
@ | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30
@ = 26*2^-32
@
@ r₁ = y - a₁x
@ = y - a₀x - d₀x
@ = r₀ - d₀x
@ = r₀ - u₁ ( r₀ - γ ) x where 0 ≤ γ < 2^-57
@ = r₀ ( 1 - u₁x ) + u₁γx
@
@ Hence
@
@ | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57
@ = (260+128)*2^-64
@ < 2^-55
@
@ Empirically it seems to be nearly twice as good as this.
@
@ To determine correctly whether the exact remainder calculation can be skipped we need a result
@ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation
@ and so 1ulp is 2^-53 and so the calculation above suffices.
.align 2
.thumb_func
qfp_ddiv:
push {r4-r7,r14}
ddiv0: @ entry point from dtan
mdunpack r2,r3,r4,r7,r6 @ unpack divisor
@ unpack dividend by hand to save on register use
lsrs r6,r1,#31
adds r6,r7
mov r12,r6 @ result sign in r12b0; r12b1 trashed
lsls r1,#1
lsrs r7,r1,#21 @ exponent
beq 1f @ zero exponent?
adds r6,r7,#1
lsrs r6,#11
beq 2f @ exponent != 0x7ff? then done
1:
movs r0,#0
movs r1,#0
subs r7,#64 @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct
lsls r7,#12
2:
subs r6,r7,r4
lsls r6,#2
add r12,r12,r6 @ (signed) exponent in r12[31..8]
subs r7,#1 @ implied 1
lsls r7,#21
subs r1,r7
lsrs r1,#1
// see dreciptest-boxc.c
lsrs r4,r3,#15 @ x2=x>>15; // Q5 32..63
ldr r5,=#(rcpapp-32)
ldrb r4,[r5,r4] @ u=lut5[x2-32]; // Q8
lsls r5,r3,#8
muls r5,r5,r4
asrs r5,#14 @ e=(i32)(u*(x<<8))>>14; // Q22
asrs r6,r5,#11
muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
subs r5,r6
muls r5,r5,r4 @ c=(e-e2)*u; // Q30
lsls r6,r4,#7
asrs r5,#14
adds r5,#1
asrs r5,#1
subs r6,r5 @ u0=(u<<7)-((c+0x4000)>>15); // Q15
@ here
@ r0:r1 y mantissa
@ r2:r3 x mantissa
@ r6 u0, first approximation to 1/x Q15
@ r12: result sign, exponent
lsls r4,r3,#10
lsrs r5,r2,#22
orrs r5,r4 @ x1=(q30)x
muls r5,r6 @ u0*x1 Q45
asrs r5,#15 @ v=u0*x1-1 Q30
muls r5,r6 @ u0*v Q45
asrs r5,#14
adds r5,#1
asrs r5,#1 @ round u0*v to Q30
lsls r6,#15
subs r6,r5 @ u1 Q30
@ here
@ r0:r1 y mantissa
@ r2:r3 x mantissa
@ r6 u1, second approximation to 1/x Q30
@ r12: result sign, exponent
push {r2,r3}
lsls r4,r1,#11
lsrs r5,r0,#21
orrs r4,r5 @ y0=(q31)y
mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5 @ y0*u1 Q61
adds r4,r4
adcs r5,r5 @ a0=(q30)(y0*u1)
@ here
@ r0:r1 y mantissa
@ r5 a0, first approximation to y/x Q30
@ r6 u1, second approximation to 1/x Q30
@ r12 result sign, exponent
ldr r2,[r13,#0] @ xL
mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3 @ xL*a0
ldr r4,[r13,#4] @ xH
muls r4,r5 @ xH*a0
adds r3,r4 @ r2:r3 now x*a0 Q82
lsrs r2,#25
lsls r1,r3,#7
orrs r2,r1 @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89
lsls r4,r0,#5 @ y Q57
subs r0,r4,r2 @ r0x=y-x*a0 Q57 (signed)
@ here
@ r0 r0x Q57
@ r5 a0, first approximation to y/x Q30
@ r4 yL Q57
@ r6 u1 Q30
@ r12 result sign, exponent
muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6 @ r7:r6 r0x*u1 Q87
asrs r3,r6,#25
adds r5,r3
lsls r3,r6,#7 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point)
@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7
@ but these bits are thrown away in the rounding and conversion to Q52 below
@ here
@ r3:r5 a1 Q62 candidate quotient [0.5,2) or so
@ r4 yL Q57
@ r12 result sign, exponent
movs r6,#0
adds r3,#128 @ for initial rounding to Q53
adcs r5,r5,r6
lsrs r1,r5,#30
bne dd_0
@ here candidate quotient a1 is in range [0.5,1)
@ so 30 significant bits in r5
lsls r4,#1 @ y now Q58
lsrs r1,r5,#9 @ to Q52
lsls r0,r5,#23
lsrs r3,#9 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
orrs r0,r3
bcs dd_1
b dd_2
dd_0:
@ here candidate quotient a1 is in range [1,2)
@ so 31 significant bits in r5
movs r2,#4
add r12,r12,r2 @ fix exponent; r3:r5 now effectively Q61
adds r3,#128 @ complete rounding to Q53
adcs r5,r5,r6
lsrs r1,r5,#10
lsls r0,r5,#22
lsrs r3,#10 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
orrs r0,r3
bcc dd_2
dd_1:
@ here
@ r0:r1 rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest
@ r4 yL Q58 or Q57
@ r12 result sign, exponent
@ carry set
adcs r0,r0,r0
adcs r1,r1,r1 @ z Q53 with 1 in LSB
lsls r4,#16 @ Q105-32=Q73
ldr r2,[r13,#0] @ xL Q52
ldr r3,[r13,#4] @ xH Q20
movs r5,r1 @ zH Q21
muls r5,r2 @ zH*xL Q73
subs r4,r5
muls r3,r0 @ zL*xH Q73
subs r4,r3
mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3 @ xL*zL
rsbs r2,#0 @ borrow from low half?
sbcs r4,r3 @ y-xz Q73 (remainder bits 52..73)
cmp r4,#0
bmi 1f
movs r2,#0 @ round up
adds r0,#1
adcs r1,r2
1:
lsrs r0,#1 @ shift back down to Q52
lsls r2,r1,#31
orrs r0,r2
lsrs r1,#1
dd_2:
add r13,#8
mov r2,r12
lsls r7,r2,#31 @ result sign
asrs r2,#2 @ result exponent
ldr r3,=#0x3fd
adds r2,r3
ldr r3,=#0x7fe
cmp r2,r3
bhs dd_3 @ over- or underflow?
lsls r2,#20
adds r1,r2 @ pack exponent
dd_5:
adds r1,r7 @ pack sign
pop {r4-r7,r15}
dd_3:
movs r0,#0
cmp r2,#0
bgt dd_4 @ overflow?
movs r1,r7
pop {r4-r7,r15}
dd_4:
adds r3,#1 @ 0x7ff
lsls r1,r3,#20
b dd_5
/*
Approach to square root x=sqrt(y) is as follows.
First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y
to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because
d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is
a better approximation to sqrt(y). If this is near a rounding boundary we
compute an exact remainder y-a5*a5 to decide whether to round up or down.
The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively
that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32.
More precisely, with "y" representing y truncated to 30 binary places:
u=(q3)y; // 24-entry table
a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u); // first approximation from table
p0=(q16)(a0*a0) * (q16)y;
r0=(q20)(p0-1);
dy0=(q15)(r0*a0); // Newton-Raphson correction term
a1=(q16)a0-dy0/2; // good to ~9 bits
p1=(q19)(a1*a1)*(q19)y;
r1=(q23)(p1-1);
dy1=(q15~)(r1*a1); // second Newton-Raphson correction
a2x=(q16)a1-dy1/2; // good to ~16 bits
a2=a2x-a2x/1t16; // prevent overflow of a2*a2 in 32 bits
p2=(a2*a2)*(q30)y; // Q62
r2=(q36)(p2-1+1t-31);
dy2=(q30)(r2*a2); // Q52->Q30
a3=(q31)a2-dy2/2; // good to about 30 bits
a4=(q30)(a3*(q30)y+1t-31); // good to about 28 bits
Error analysis (names as above: a3 ~ 1/sqrt(y), a4 ~ sqrt(y), a5 = a4 + d4)
r4 = y - a4²
d4 = 1/2 a3 r4
a5 = a4 + d4
r5 = y - a5²
   = y - ( a4 + d4 )²
   = y - a4² - a3a4r4 - 1/4 a3²r4²
   = r4 ( 1 - a3a4 ) - 1/4 a3²r4²
| r5 | < | r4 | | 1 - a3a4 | + 1/4 r4²
a5 = √( y - r5 )
   = √y ( 1 - 1/2 r5/y + ... )
So to first order (second order being very tiny)
√y - a5 = 1/2 r5/√y
and
| √y - a5 | < 1/2 ( | r4/y | | 1 - a3a4 | + 1/4 r4²/y )
From dsqrttest.c (conservatively):
           < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 )
           = 1/2 ( 200 + 200 ) * 2^-64
           < 2^-56
Empirically we see about 1ulp worst-case error including rounding at Q57.
To determine correctly whether the exact remainder calculation can be skipped we need a result
accurate to < 0.25ulp at Q52, or 2^-54.
*/
dq_2:
bge dq_3 @ +Inf?
movs r1,#0
b dq_4
dq_0:
lsrs r1,#31
lsls r1,#31 @ preserve sign bit
lsrs r2,#21 @ extract exponent
beq dq_4 @ -0? return it
asrs r1,#11 @ make -Inf
b dq_4
dq_3:
ldr r1,=#0x7ff
lsls r1,#20 @ return +Inf
dq_4:
movs r0,#0
dq_1:
bx r14
.align 2
.thumb_func
qfp_dsqrt:
lsls r2,r1,#1
bcs dq_0 @ negative?
lsrs r2,#21 @ extract exponent
subs r2,#1
ldr r3,=#0x7fe
cmp r2,r3
bhs dq_2 @ catches 0 and +Inf
push {r4-r7,r14}
lsls r4,r2,#20
subs r1,r4 @ insert implied 1
lsrs r2,#1
bcc 1f @ even exponent? skip
adds r0,r0,r0 @ odd exponent: shift up mantissa
adcs r1,r1,r1
1:
lsrs r3,#2
adds r2,r3
lsls r2,#20
mov r12,r2 @ save result exponent
@ here
@ r0:r1 y mantissa Q52 [1,4)
@ r12 result exponent
adr r4,drsqrtapp-8 @ first eight table entries are never accessed because of the mantissa's leading 1
lsrs r2,r1,#17 @ y Q3
ldrb r2,[r4,r2] @ initial approximation to reciprocal square root a0 Q8
lsrs r3,r1,#4 @ first Newton-Raphson iteration
muls r3,r2
muls r3,r2 @ i32 p0=a0*a0*(y>>14); // Q32
asrs r3,r3,#12 @ i32 r0=p0>>12; // Q20
muls r3,r2
asrs r3,#13 @ i32 dy0=(r0*a0)>>13; // Q15
lsls r2,#8
subs r2,r3 @ i32 a1=(a0<<8)-dy0; // Q16
movs r3,r2
muls r3,r3
lsrs r3,#13
lsrs r4,r1,#1
muls r3,r4 @ i32 p1=((a1*a1)>>11)*(y>>11); // Q19*Q19=Q38
asrs r3,#15 @ i32 r1=p1>>15; // Q23
muls r3,r2
asrs r3,#23
adds r3,#1
asrs r3,#1 @ i32 dy1=(r1*a1+(1<<23))>>24; // Q23*Q16=Q39; Q15
subs r2,r3 @ i32 a2=a1-dy1; // Q16
lsrs r3,r2,#16
subs r2,r3 @ if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2
@ here
@ r0:r1 y mantissa
@ r2 a2 ~ 1/sqrt(y) Q16
@ r12 result exponent
movs r3,r2
muls r3,r3
lsls r1,#10
lsrs r4,r0,#22
orrs r1,r4 @ y Q30
mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3 @ i64 p2=(ui64)(a2*a2)*(ui64)y; // Q62 r4:r3
lsls r5,r3,#6
lsrs r4,#26
orrs r4,r5
adds r4,#0x20 @ i32 r2=(p2>>26)+0x20; // Q36 r4
uxth r5,r4
muls r5,r2
asrs r4,#16
muls r4,r2
lsrs r5,#16
adds r4,r5
asrs r4,#6 @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30
lsls r2,#15
subs r2,r4
@ here
@ r0 y low bits
@ r1 y Q30
@ r2 a3 ~ 1/sqrt(y) Q31
@ r12 result exponent
mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4
adds r3,r3,r3
adcs r4,r4,r4
adds r3,r3,r3
movs r3,#0
adcs r3,r4 @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30
@ here
@ r0 y low bits
@ r1 y Q30
@ r2 a3 Q31 ~ 1/sqrt(y)
@ r3 a4 Q30 ~ sqrt(y)
@ r12 result exponent
square32_64 r3, r4,r5, r6,r5,r7
lsls r6,r0,#8
lsrs r7,r1,#2
subs r6,r4
sbcs r7,r5 @ r4=(q60)y-a4*a4
@ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60
lsls r5,r7,#29
lsrs r6,#3
adcs r6,r5 @ r4 Q57 with rounding
muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2 @ d4=a3*r4/2 Q89
@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30
adds r2,#8
asrs r2,#5 @ d4 Q52, rounded to Q53 with spare bit in carry
@ here
@ r0 y low bits
@ r1 y Q30
@ r2 d4 Q52, rounded to Q53
@ C flag contains d4_b53
@ r3 a4 Q30
bcs dq_5
lsrs r5,r3,#10 @ a4 Q52
lsls r4,r3,#22
asrs r1,r2,#31
adds r0,r2,r4
adcs r1,r5 @ a4+d4
add r1,r12 @ pack exponent
pop {r4-r7,r15}
.ltorg
@ round(sqrt(2^22./[68:8:252]))
drsqrtapp:
.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8
.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95
.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81
dq_5:
@ here we are near a rounding boundary, C is set
adcs r2,r2,r2 @ d4 Q53+1ulp
lsrs r5,r3,#9
lsls r4,r3,#23 @ r4:r5 a4 Q53
asrs r1,r2,#31
adds r4,r2,r4
adcs r5,r1 @ r4:r5 a5=a4+d4 Q53+1ulp
movs r3,r5
muls r3,r4
square32_64 r4,r1,r2,r6,r2,r7
adds r2,r3
adds r2,r3 @ r1:r2 a5^2 Q106
lsls r0,#22 @ y Q84
rsbs r1,#0
sbcs r0,r2 @ remainder y-a5^2
bmi 1f @ y<a5^2: no need to increment a5
movs r3,#0
adds r4,#1
adcs r5,r3 @ bump a5 if over rounding boundary
1:
lsrs r0,r4,#1
lsrs r1,r5,#1
lsls r5,#31
orrs r0,r5
add r1,r12
pop {r4-r7,r15}
@ compare r0:r1 against r2:r3, returning -1/0/1 for <, =, >
@ also set flags accordingly
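@ after the flush, the logic is integer comparison of the raw bit patterns,
@ with the sense reversed when both values are negative; a C model (a sketch):
@   #include <stdint.h>
@   static int dcmp_model(int64_t x, int64_t y) {   // raw IEEE-754 bit patterns
@       if (((x | y) & INT64_MAX) == 0) return 0;   // +0 == -0
@       if ((x ^ y) < 0) return x < 0 ? -1 : 1;     // opposite signs
@       if (x == y) return 0;
@       return (x < y) != (x < 0) ? -1 : 1;         // reverse order for negatives
@   }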
.thumb_func
qfp_dcmp:
push {r4,r6,r7,r14}
ldr r7,=#0x7ff @ flush NaNs and denormals
lsls r4,r1,#1
lsrs r4,#21
beq 1f
cmp r4,r7
bne 2f
1:
movs r0,#0
lsrs r1,#20
lsls r1,#20
2:
lsls r4,r3,#1
lsrs r4,#21
beq 1f
cmp r4,r7
bne 2f
1:
movs r2,#0
lsrs r3,#20
lsls r3,#20
2:
dcmp_fast_entry:
movs r6,#1
eors r3,r1
bmi 4f @ opposite signs? then can proceed on basis of sign of x
eors r3,r1 @ restore r3
bpl 1f
rsbs r6,#0 @ negative? flip comparison
1:
cmp r1,r3
bne 1f
cmp r0,r2
bhi 2f
blo 3f
5:
movs r6,#0 @ equal? result is 0
1:
bgt 2f
3:
rsbs r6,#0
2:
subs r0,r6,#0 @ copy and set flags
pop {r4,r6,r7,r15}
4:
orrs r3,r1 @ make -0==+0
adds r3,r3
orrs r3,r0
orrs r3,r2
beq 5b
cmp r1,#0
bge 2b
b 3b
@ "scientific" functions start here
.thumb_func
push_r8_r11:
mov r4,r8
mov r5,r9
mov r6,r10
mov r7,r11
push {r4-r7}
bx r14
.thumb_func
pop_r8_r11:
pop {r4-r7}
mov r8,r4
mov r9,r5
mov r10,r6
mov r11,r7
bx r14
@ double-length CORDIC rotation step
@ r0:r1 ω
@ r6 32-i (complementary shift)
@ r7 i (shift)
@ r8:r9 x
@ r10:r11 y
@ r12 coefficient pointer
@ an option in rotation mode would be to compute the sequence of σ values
@ in one pass, rotate the initial vector by the residual ω and then run a
@ second pass to compute the final x and y. This would relieve pressure
@ on registers and hence possibly be faster. The same trick does not work
@ in vectoring mode (but perhaps one could work to single precision in
@ a first pass and then double precision in a second pass?).
.thumb_func
dcordic_vec_step:
mov r2,r12
ldmia r2!,{r3,r4}
mov r12,r2
mov r2,r11
cmp r2,#0
blt 1f
b 2f
.thumb_func
dcordic_rot_step:
mov r2,r12
ldmia r2!,{r3,r4}
mov r12,r2
cmp r1,#0
bge 1f
2:
@ ω<0 / y>=0
@ ω+=dω
@ x+=y>>i, y-=x>>i
adds r0,r3
adcs r1,r4
mov r3,r11
asrs r3,r7
mov r4,r11
lsls r4,r6
mov r2,r10
lsrs r2,r7
orrs r2,r4 @ r2:r3 y>>i, rounding in carry
mov r4,r8
mov r5,r9 @ r4:r5 x
adcs r2,r4
adcs r3,r5 @ r2:r3 x+(y>>i)
mov r8,r2
mov r9,r3
mov r3,r5
lsls r3,r6
asrs r5,r7
lsrs r4,r7
orrs r4,r3 @ r4:r5 x>>i, rounding in carry
mov r2,r10
mov r3,r11
sbcs r2,r4
sbcs r3,r5 @ r2:r3 y-(x>>i)
mov r10,r2
mov r11,r3
bx r14
@ ω>0 / y<0
@ ω-=dω
@ x-=y>>i, y+=x>>i
1:
subs r0,r3
sbcs r1,r4
mov r3,r9
asrs r3,r7
mov r4,r9
lsls r4,r6
mov r2,r8
lsrs r2,r7
orrs r2,r4 @ r2:r3 x>>i, rounding in carry
mov r4,r10
mov r5,r11 @ r4:r5 y
adcs r2,r4
adcs r3,r5 @ r2:r3 y+(x>>i)
mov r10,r2
mov r11,r3
mov r3,r5
lsls r3,r6
asrs r5,r7
lsrs r4,r7
orrs r4,r3 @ r4:r5 y>>i, rounding in carry
mov r2,r8
mov r3,r9
sbcs r2,r4
sbcs r3,r5 @ r2:r3 x-(y>>i)
mov r8,r2
mov r9,r3
bx r14
ret_dzero:
movs r0,#0
movs r1,#0
bx r14
@ convert double to signed int, rounding towards 0, clamping
.thumb_func
qfp_double2int_z:
cmp r1,#0
bge qfp_double2int @ +ve or zero? then use rounding towards -Inf
push {r14}
lsls r1,#1 @ -ve: clear sign bit
lsrs r1,#1
bl qfp_double2uint @ convert to unsigned, rounding towards -Inf
movs r1,#1
lsls r1,#31 @ r1=0x80000000
cmp r0,r1
bhi 1f
rsbs r0,#0
pop {r15}
1:
mov r0,r1
pop {r15}
@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf
@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works
.thumb_func
qfp_double2int:
movs r2,#0 @ and fall through
.thumb_func
qfp_double2fix:
push {r14}
adds r2,#32
bl qfp_double2fix64
movs r0,r1
pop {r15}
.thumb_func
qfp_double2uint:
movs r2,#0 @ and fall through
.thumb_func
qfp_double2ufix:
push {r14}
adds r2,#32
bl qfp_double2ufix64
movs r0,r1
pop {r15}
.thumb_func
qfp_float2int64_z:
cmp r0,#0
bge qfp_float2int64 @ +ve or zero? then use rounding towards -Inf
push {r14}
lsls r0,#1 @ -ve: clear sign bit
lsrs r0,#1
bl qfp_float2uint64 @ convert to unsigned, rounding towards -Inf
movs r2,#1
lsls r2,#31 @ r2=0x80000000
cmp r1,r2
bhs 1f
mvns r1,r1
rsbs r0,#0
bcc 2f
adds r1,#1
2:
pop {r15}
1:
movs r0,#0
mov r1,r2
pop {r15}
.thumb_func
qfp_float2int64:
movs r1,#0 @ and fall through
.thumb_func
qfp_float2fix64:
push {r14}
bl f2fix
b d2f64_a
.thumb_func
qfp_float2uint64:
movs r1,#0 @ and fall through
.thumb_func
qfp_float2ufix64:
asrs r3,r0,#23 @ negative? return 0
bmi ret_dzero
@ and fall through
@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
.thumb_func
f2fix:
push {r4,r14}
mov r12,r1
asrs r3,r0,#31
lsls r0,#1
lsrs r2,r0,#24
beq 1f @ zero?
cmp r2,#0xff @ Inf?
beq 2f
subs r1,r2,#1
subs r2,#0x7f @ remove exponent bias
lsls r1,#24
subs r0,r1 @ insert implied 1
eors r0,r3
subs r0,r3 @ top two's complement
asrs r1,r0,#4 @ convert to double format
lsls r0,#28
b d2fix_a
1:
movs r0,#0
movs r1,r0
movs r3,r0
pop {r4,r15}
2:
mvns r0,r3 @ return max/min value
mvns r1,r3
pop {r4,r15}
.thumb_func
qfp_double2int64_z:
cmp r1,#0
bge qfp_double2int64 @ +ve or zero? then use rounding towards -Inf
push {r14}
lsls r1,#1 @ -ve: clear sign bit
lsrs r1,#1
bl qfp_double2uint64 @ convert to unsigned, rounding towards -Inf
cmp r1,#0
blt 1f
mvns r1,r1
rsbs r0,#0
bcc 2f
adds r1,#1
2:
pop {r15}
1:
movs r0,#0
movs r1,#1
lsls r1,#31 @ 0x80000000
pop {r15}
.thumb_func
qfp_double2int64:
movs r2,#0 @ and fall through
.thumb_func
qfp_double2fix64:
push {r14}
bl d2fix
d2f64_a:
asrs r2,r1,#31
cmp r2,r3
bne 1f @ sign extension bits fail to match sign of result?
pop {r15}
1:
mvns r0,r3
movs r1,#1
lsls r1,#31
eors r1,r1,r0 @ generate extreme fixed-point values
pop {r15}
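@
@ Illustrative note (not part of the library): a hedged C model of the clamp
@ above; d2fix returns r0:r1:r3 with r3 the sign extension (0 or -1), and
@ the result is clamped when r1 does not sign-extend to r3:
@
@   #include <stdint.h>
@   static int64_t clamp_to_64(uint32_t lo, uint32_t hi, int32_t ext) {
@       if (((int32_t)hi >> 31) == ext)
@           return (int64_t)(((uint64_t)hi << 32) | lo);
@       return ext ? INT64_MIN : INT64_MAX;   // extreme values, by sign
@   }
@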
.thumb_func
qfp_double2uint64:
movs r2,#0 @ and fall through
.thumb_func
qfp_double2ufix64:
asrs r3,r1,#20 @ negative? return 0
bmi ret_dzero
@ and fall through
@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf
@ result clamped so that r3 can only be 0 or -1
@ trashes r12
.thumb_func
d2fix:
push {r4,r14}
mov r12,r2
bl dunpacks
asrs r4,r2,#16
adds r4,#1
bge 1f
movs r1,#0 @ -0 -> +0
1:
asrs r3,r1,#31
d2fix_a:
@ here
@ r0:r1 two's complement mantissa
@ r2 unbiased exponent
@ r3 mantissa sign extension bits
add r2,r12 @ exponent plus offset for required binary point position
subs r2,#52 @ required shift
bmi 1f @ shift down?
@ here a shift up by r2 places
cmp r2,#12 @ will clamp?
bge 2f
movs r4,r0
lsls r1,r2
lsls r0,r2
rsbs r2,#0
adds r2,#32 @ complementary shift
lsrs r4,r2
orrs r1,r4
pop {r4,r15}
2:
mvns r0,r3
mvns r1,r3 @ overflow: clamp to extreme fixed-point values
pop {r4,r15}
1:
@ here a shift down by -r2 places
adds r2,#32
bmi 1f @ long shift?
mov r4,r1
lsls r4,r2
rsbs r2,#0
adds r2,#32 @ complementary shift
asrs r1,r2
lsrs r0,r2
orrs r0,r4
pop {r4,r15}
1:
@ here a long shift down
movs r0,r1
asrs r1,#31 @ shift down 32 places
adds r2,#32
bmi 1f @ very long shift?
rsbs r2,#0
adds r2,#32
asrs r0,r2
pop {r4,r15}
1:
movs r0,r3 @ result very near zero: use sign extension bits
movs r1,r3
pop {r4,r15}
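@
@ Illustrative note (not part of the library): the Cortex-M0+ has no 64-bit
@ shifts, so the paths above compose one from two 32-bit shifts plus a
@ complementary shift, as in this hedged C model of the short shift-down
@ case (0 < k < 32):
@
@   #include <stdint.h>
@   static int64_t asr64_short(int64_t v, unsigned k) {
@       uint32_t lo = (uint32_t)v;
@       int32_t  hi = (int32_t)(v >> 32);
@       uint32_t newlo = (lo >> k) | ((uint32_t)hi << (32 - k));
@       int32_t  newhi = hi >> k;             // arithmetic shift keeps the sign
@       return (int64_t)(((uint64_t)(uint32_t)newhi << 32) | newlo);
@   }
@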
@ float <-> double conversions
.thumb_func
qfp_float2double:
lsrs r3,r0,#31 @ sign bit
lsls r3,#31
lsls r1,r0,#1
lsrs r2,r1,#24 @ exponent
beq 1f @ zero?
cmp r2,#0xff @ Inf?
beq 2f
lsrs r1,#4 @ exponent and top 20 bits of mantissa
ldr r2,=#(0x3ff-0x7f)<<20 @ difference in exponent offsets
adds r1,r2
orrs r1,r3
lsls r0,#29 @ bottom 3 bits of mantissa
bx r14
1:
movs r1,r3 @ return signed zero
3:
movs r0,#0
bx r14
2:
ldr r1,=#0x7ff00000 @ return signed infinity
adds r1,r3
b 3b
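@
@ Illustrative note (not part of the library): a hedged C model of the
@ normal-case packing above (0 < exponent < 255 assumed; zero and Inf take
@ the special paths):
@
@   #include <stdint.h>
@   static uint64_t float_bits_to_double_bits(uint32_t f) {
@       uint32_t hi = (f << 1) >> 4;            // exponent + top 20 mantissa bits
@       hi += (uint32_t)(0x3ff - 0x7f) << 20;   // adjust exponent bias
@       hi |= f & 0x80000000u;                  // sign
@       uint32_t lo = f << 29;                  // bottom 3 mantissa bits
@       return ((uint64_t)hi << 32) | lo;
@   }
@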
.thumb_func
qfp_double2float:
lsls r2,r1,#1
lsrs r2,#21 @ exponent
ldr r3,=#0x3ff-0x7f
subs r2,r3 @ fix exponent bias
ble 1f @ underflow or zero
cmp r2,#0xff
bge 2f @ overflow or infinity
lsls r2,#23 @ position exponent of result
lsrs r3,r1,#31
lsls r3,#31
orrs r2,r3 @ insert sign
lsls r3,r0,#3 @ rounding bits
lsrs r0,#29
lsls r1,#12
lsrs r1,#9
orrs r0,r1 @ assemble mantissa
orrs r0,r2 @ insert exponent and sign
lsls r3,#1
bcc 3f @ no rounding
beq 4f @ all sticky bits 0?
5:
adds r0,#1
3:
bx r14
4:
lsrs r3,r0,#1 @ odd? then round up
bcs 5b
bx r14
1:
beq 6f @ check case where value is just less than smallest normal
7:
lsrs r0,r1,#31
lsls r0,#31
bx r14
6:
lsls r2,r1,#12 @ 20 1s at top of mantissa?
asrs r2,#12
adds r2,#1
bne 7b
lsrs r2,r0,#29 @ and 3 more 1s?
cmp r2,#7
bne 7b
movs r2,#1 @ return smallest normal with correct sign
b 8f
2:
movs r2,#0xff
8:
lsrs r0,r1,#31 @ pack sign with exponent r2 and zero mantissa (infinity or smallest normal)
lsls r0,#8
adds r0,r2
lsls r0,#23
bx r14
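@
@ Illustrative note (not part of the library): the rounding above is round
@ to nearest, ties to even; a hedged C model of the decision, where rest
@ holds the discarded mantissa bits left-aligned (round bit at the top):
@
@   #include <stdint.h>
@   static uint32_t round_to_nearest_even(uint32_t result, uint32_t rest) {
@       uint32_t round  = rest >> 31;         // first discarded bit
@       uint32_t sticky = (rest << 1) != 0;   // any further discarded bits
@       if (round && (sticky || (result & 1)))
@           result++;                         // round up (ties go to even)
@       return result;
@   }
@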
@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding
.thumb_func
qfp_uint2double:
movs r1,#0 @ and fall through
.thumb_func
qfp_ufix2double:
movs r2,r1
movs r1,#0
b qfp_ufix642double
.thumb_func
qfp_int2double:
movs r1,#0 @ and fall through
.thumb_func
qfp_fix2double:
movs r2,r1
asrs r1,r0,#31 @ sign extend
b qfp_fix642double
.thumb_func
qfp_uint642double:
movs r2,#0 @ and fall through
.thumb_func
qfp_ufix642double:
movs r3,#0
b uf2d
.thumb_func
qfp_int642double:
movs r2,#0 @ and fall through
.thumb_func
qfp_fix642double:
asrs r3,r1,#31 @ sign bit across all bits
eors r0,r3
eors r1,r3
subs r0,r3
sbcs r1,r3
uf2d:
push {r4,r5,r14}
ldr r4,=#0x432
subs r2,r4,r2 @ form biased exponent
@ here
@ r0:r1 unnormalised mantissa
@ r2 -Q (will become exponent)
@ r3 sign across all bits
cmp r1,#0
bne 1f @ short normalising shift?
movs r1,r0
beq 2f @ zero? return it
movs r0,#0
subs r2,#32 @ fix exponent
1:
asrs r4,r1,#21
bne 3f @ will need shift down (and rounding?)
bcs 4f @ normalised already?
5:
subs r2,#1
adds r0,r0 @ shift up
adcs r1,r1
lsrs r4,r1,#21
bcc 5b
4:
ldr r4,=#0x7fe
cmp r2,r4
bhs 6f @ over/underflow? return signed zero/infinity
7:
lsls r2,#20 @ pack and return
adds r1,r2
lsls r3,#31
adds r1,r3
2:
pop {r4,r5,r15}
6: @ return signed zero/infinity according to unclamped exponent in r2
mvns r2,r2
lsrs r2,#21
movs r0,#0
movs r1,#0
b 7b
3:
@ here we need to shift down to normalise and possibly round
bmi 1f @ already normalised to Q63?
2:
subs r2,#1
adds r0,r0 @ shift up
adcs r1,r1
bpl 2b
1:
@ here we have a 1 in b63 of r0:r1
adds r2,#11 @ correct exponent for subsequent shift down
lsls r4,r0,#21 @ save bits for rounding
lsrs r0,#11
lsls r5,r1,#21
orrs r0,r5
lsrs r1,#11
lsls r4,#1
beq 1f @ sticky bits are zero?
8:
movs r4,#0
adcs r0,r4
adcs r1,r4
b 4b
1:
bcc 4b @ sticky bits are zero but not on rounding boundary
lsrs r4,r0,#1 @ increment if odd (force round to even)
b 8b
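@
@ Illustrative note (not part of the library): a hedged C model of the
@ normalise-and-round performed by uf2d for a non-zero magnitude, packing
@ by addition so a rounding carry-out bumps the exponent automatically
@ (the same trick the assembly uses):
@
@   #include <stdint.h>
@   #include <string.h>
@   static double u64_to_double(uint64_t m) {     // m != 0
@       int sh = 0;
@       while (!(m >> 63)) { m <<= 1; sh++; }     // leading 1 to bit 63
@       uint64_t mant = m >> 11;                  // 53 bits incl. leading 1
@       uint64_t rest = m & 0x7ff;                // discarded bits
@       if (rest > 0x400 || (rest == 0x400 && (mant & 1)))
@           mant++;                               // round to nearest even
@       uint64_t bits = ((uint64_t)(1085 - sh) << 52) + mant;
@       double d; memcpy(&d, &bits, sizeof d);    // reinterpret as IEEE double
@       return d;
@   }
@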
.ltorg
.thumb_func
dunpacks:
mdunpacks r0,r1,r2,r3,r4
ldr r3,=#0x3ff
subs r2,r3 @ exponent without offset
bx r14
@ r0:r1 signed mantissa Q52
@ r2 unbiased exponent < 10 (i.e., |x|<2^10)
@ r4 pointer to:
@ - divisor reciprocal approximation r=1/d Q15
@ - divisor d Q62 0..20
@ - divisor d Q62 21..41
@ - divisor d Q62 42..62
@ returns:
@ r0:r1 reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)
@ r2 quotient q (number of reductions)
@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign
@ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)
.thumb_func
dreduce:
adds r2,#2 @ e+2
bmi 1f @ |x|<0.25, too small to need adjustment
cmp r2,#12
bge 4f
2:
movs r5,#17
subs r5,r2 @ 15-e
movs r3,r1 @ Q20
asrs r3,r5 @ x Q5
adds r2,#8 @ e+10
adds r5,#7 @ 22-e = 32-(e+10)
movs r6,r0
lsrs r6,r5
lsls r0,r2
lsls r1,r2
orrs r1,r6 @ r0:r1 x Q62
ldmia r4,{r4-r7}
muls r3,r4 @ rx Q20
asrs r2,r3,#20
movs r3,#0
adcs r2,r3 @ rx Q0 rounded = q; e.g. for r=1.5, |q|<1.5*2^10
muls r5,r2 @ qd in pieces: L Q62
muls r6,r2 @ M Q41
muls r7,r2 @ H Q20
lsls r7,#10
asrs r4,r6,#11
lsls r6,#21
adds r6,r5
adcs r7,r4
asrs r5,#31
adds r7,r5 @ r6:r7 qd Q62
subs r0,r6
sbcs r1,r7 @ remainder Q62
bx r14
4:
movs r2,#12 @ overflow: clamp to +/-1024
movs r0,#0
asrs r1,#31
lsls r1,#1
adds r1,#1
lsls r1,#20
b 2b
1:
lsls r1,#8
lsrs r3,r0,#24
orrs r1,r3
lsls r0,#8 @ r0:r1 Q60, to be shifted down -r2 places
rsbs r3,r2,#0
adds r2,#32 @ shift down in r3, complementary shift in r2
bmi 1f @ long shift?
2:
movs r4,r1
asrs r1,r3
lsls r4,r2
lsrs r0,r3
orrs r0,r4
movs r2,#0 @ rounding
adcs r0,r2
adcs r1,r2
bx r14
1:
movs r0,r1 @ down 32 places
asrs r1,#31
subs r3,#32
adds r2,#32
bpl 2b
movs r0,#0 @ very long shift? return 0
movs r1,#0
movs r2,#0
bx r14
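@
@ Illustrative note (not part of the library): a hedged C model of the
@ reduction above; q is estimated with the Q15 reciprocal, then q*d is
@ subtracted exactly using the three 21-bit pieces of d (values are bounded
@ so each product fits):
@
@   #include <stdint.h>
@   typedef struct { int32_t r;                    // 1/d Q15
@                    int32_t d0, d1, d2; } red_t;  // d Q62, 21-bit pieces
@   static int64_t reduce(int64_t x, const red_t *c, int32_t *q) {  // x Q62
@       int32_t x5 = (int32_t)(x >> 57);       // x Q5
@       *q = (x5 * c->r + (1 << 19)) >> 20;    // x/d rounded to integer
@       int64_t qd = (int64_t)*q * c->d0       // q*d Q62, assembled exactly
@                  + (((int64_t)*q * c->d1) << 21)
@                  + (((int64_t)*q * c->d2) << 42);
@       return x - qd;                         // remainder Q62
@   }
@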
.thumb_func
qfp_dtan:
push {r4-r7,r14}
bl push_r8_r11
bl dsincos
mov r12,r0 @ save ε
bl dcos_finish
push {r0,r1}
mov r0,r12
bl dsin_finish
pop {r2,r3}
bl pop_r8_r11
b ddiv0 @ compute sin θ/cos θ
.thumb_func
qfp_dcos:
push {r4-r7,r14}
bl push_r8_r11
bl dsincos
bl dcos_finish
b 1f
.thumb_func
qfp_dsin:
push {r4-r7,r14}
bl push_r8_r11
bl dsincos
bl dsin_finish
1:
bl pop_r8_r11
pop {r4-r7,r15}
@ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that
@ θ=α+ε and |ε|≤2^-32
@ on return:
@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9 cos α Q62
@ r10:r11 sin α Q62
.thumb_func
dsincos:
push {r14}
bl dunpacks
adr r4,dreddata0
bl dreduce
movs r4,#0
ldr r5,=#0x9df04dbb @ this value compensates for the non-unity scaling of the CORDIC rotations
ldr r6,=#0x36f656c5
lsls r2,#31
bcc 1f
@ quadrant 2 or 3
mvns r6,r6
rsbs r5,r5,#0
adcs r6,r4
1:
lsls r2,#1
bcs 1f
@ even quadrant
mov r10,r4
mov r11,r4
mov r8,r5
mov r9,r6
b 2f
1:
@ odd quadrant
mov r8,r4
mov r9,r4
mov r10,r5
mov r11,r6
2:
adr r4,dtab_cc
mov r12,r4
movs r7,#1
movs r6,#31
1:
bl dcordic_rot_step
adds r7,#1
subs r6,#1
cmp r7,#33
bne 1b
pop {r15}
dcos_finish:
@ here
@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9 cos α Q62
@ r10:r11 sin α Q62
@ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α
mov r1,r11
@ mov r2,r10
@ lsrs r2,#31
@ adds r1,r2 @ rounding improves accuracy very slightly
muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
@ r2:r3 ε sin α Q(62+62-32)=Q92
mov r0,r8
mov r1,r9
lsls r5,r3,#2
asrs r3,r3,#30
lsrs r2,r2,#30
orrs r2,r5
sbcs r0,r2 @ include rounding
sbcs r1,r3
movs r2,#62
b qfp_fix642double
dsin_finish:
@ here
@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r8:r9 cos α Q62
@ r10:r11 sin α Q62
@ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α
mov r1,r9
muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
@ r2:r3 ε cos α Q(62+62-32)=Q92
mov r0,r10
mov r1,r11
lsls r5,r3,#2
asrs r3,r3,#30
lsrs r2,r2,#30
orrs r2,r5
adcs r0,r2 @ include rounding
adcs r1,r3
movs r2,#62
b qfp_fix642double
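@
@ Illustrative note (not part of the library): the first-order corrections
@ above suffice because |ε|≤2^-32, so the next Taylor term is at most
@ ε²/2 ≤ 2^-65, below the Q62 lsb:
@   cos(α+ε) = cos α − ε sin α + O(2^-65)
@   sin(α+ε) = sin α + ε cos α + O(2^-65)
@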
.ltorg
.align 2
dreddata0:
.word 0x0000517d @ 2/π Q15
.word 0x0014611A @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces
.word 0x000A8885
.word 0x001921FB
.thumb_func
qfp_datan2:
@ r0:r1 y
@ r2:r3 x
push {r4-r7,r14}
bl push_r8_r11
ldr r5,=#0x7ff00000
movs r4,r1
ands r4,r5 @ y==0?
beq 1f
cmp r4,r5 @ or Inf/NaN?
bne 2f
1:
lsrs r1,#20 @ flush
lsls r1,#20
movs r0,#0
2:
movs r4,r3
ands r4,r5 @ x==0?
beq 1f
cmp r4,r5 @ or Inf/NaN?
bne 2f
1:
lsrs r3,#20 @ flush
lsls r3,#20
movs r2,#0
2:
movs r6,#0 @ quadrant offset
lsls r5,#11 @ constant 0x80000000
cmp r3,#0
bpl 1f @ skip if x positive
movs r6,#2
eors r3,r5
eors r1,r5
bmi 1f @ quadrant offset=+2 if y was positive
rsbs r6,#0 @ quadrant offset=-2 if y was negative
1:
@ now in quadrant 0 or 3
adds r7,r1,r5 @ r7=-r1
bpl 1f
@ y>=0: in quadrant 0
cmp r1,r3
ble 2f @ y<~x so 0≤θ<~π/4: skip
adds r6,#1
eors r1,r5 @ negate x
b 3f @ and exchange x and y = rotate by -π/2
1:
cmp r3,r7
bge 2f @ -y<~x so -π/4<~θ≤0: skip
subs r6,#1
eors r3,r5 @ negate y and ...
3:
movs r7,r0 @ exchange x and y
movs r0,r2
movs r2,r7
movs r7,r1
movs r1,r3
movs r3,r7
2:
@ here -π/4<~θ<~π/4
@ r6 has quadrant offset
push {r6}
cmp r2,#0
bne 1f
cmp r3,#0
beq 10f @ x==0 going into division?
lsls r4,r3,#1
asrs r4,#21
adds r4,#1
bne 1f @ x==Inf going into division?
lsls r4,r1,#1
asrs r4,#21
adds r4,#1 @ y also ±Inf?
bne 10f
subs r1,#1 @ make them both just finite
subs r3,#1
b 1f
10:
movs r0,#0
movs r1,#0
b 12f
1:
bl qfp_ddiv
movs r2,#62
bl qfp_double2fix64
@ r0:r1 y/x
mov r10,r0
mov r11,r1
movs r0,#0 @ ω=0
movs r1,#0
mov r8,r0
movs r2,#1
lsls r2,#30
mov r9,r2 @ x=1
adr r4,dtab_cc
mov r12,r4
movs r7,#1
movs r6,#31
1:
bl dcordic_vec_step
adds r7,#1
subs r6,#1
cmp r7,#33
bne 1b
@ r0:r1 atan(y/x) Q62
@ r8:r9 x residual Q62
@ r10:r11 y residual Q62
mov r2,r9
mov r3,r10
subs r2,#12 @ this makes atan(0)==0
@ the following is essentially a division: for the small residual, y/x ~ atan(y/x)
movs r4,#1
lsls r4,#29
movs r7,#0
2:
lsrs r2,#1
movs r3,r3 @ preserve carry
bmi 1f
sbcs r3,r2
adds r0,r4
adcs r1,r7
lsrs r4,#1
bne 2b
b 3f
1:
adcs r3,r2
subs r0,r4
sbcs r1,r7
lsrs r4,#1
bne 2b
3:
lsls r6,r1,#31
asrs r1,#1
lsrs r0,#1
orrs r0,r6 @ Q61
12:
pop {r6}
cmp r6,#0
beq 1f
ldr r4,=#0x885A308D @ π/2 Q61
ldr r5,=#0x3243F6A8
bpl 2f
mvns r4,r4 @ negative quadrant offset
mvns r5,r5
2:
lsls r6,#31
bne 2f @ skip if quadrant offset is ±1
adds r0,r4
adcs r1,r5
2:
adds r0,r4
adcs r1,r5
1:
movs r2,#61
bl qfp_fix642double
bl pop_r8_r11
pop {r4-r7,r15}
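@
@ Illustrative note (not part of the library): a hedged C model of the
@ vectoring loop above, which drives y towards zero while accumulating the
@ rotation angle from dtab_cc (the assembly also rounds the shifts and
@ applies the residual y/x ~ atan(y/x) correction, omitted here):
@
@   #include <stdint.h>
@   extern const int64_t atan_tab[32];     // atan(2^-i), i=1..32, Q62
@   static int64_t cordic_atan(int64_t x, int64_t y) {   // Q62, x > 0
@       int64_t w = 0;
@       for (int i = 1; i <= 32; i++) {
@           int64_t xs = x >> i, ys = y >> i;
@           if (y < 0) { w -= atan_tab[i-1]; x -= ys; y += xs; }
@           else       { w += atan_tab[i-1]; x += ys; y -= xs; }
@       }
@       return w;                          // ~ atan(y0/x0) Q62
@   }
@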
.ltorg
dtab_cc:
.word 0x61bb4f69, 0x1dac6705 @ atan 2^-1 Q62
.word 0x96406eb1, 0x0fadbafc @ atan 2^-2 Q62
.word 0xab0bdb72, 0x07f56ea6 @ atan 2^-3 Q62
.word 0xe59fbd39, 0x03feab76 @ atan 2^-4 Q62
.word 0xba97624b, 0x01ffd55b @ atan 2^-5 Q62
.word 0xdddb94d6, 0x00fffaaa @ atan 2^-6 Q62
.word 0x56eeea5d, 0x007fff55 @ atan 2^-7 Q62
.word 0xaab7776e, 0x003fffea @ atan 2^-8 Q62
.word 0x5555bbbc, 0x001ffffd @ atan 2^-9 Q62
.word 0xaaaaadde, 0x000fffff @ atan 2^-10 Q62
.word 0xf555556f, 0x0007ffff @ atan 2^-11 Q62
.word 0xfeaaaaab, 0x0003ffff @ atan 2^-12 Q62
.word 0xffd55555, 0x0001ffff @ atan 2^-13 Q62
.word 0xfffaaaab, 0x0000ffff @ atan 2^-14 Q62
.word 0xffff5555, 0x00007fff @ atan 2^-15 Q62
.word 0xffffeaab, 0x00003fff @ atan 2^-16 Q62
.word 0xfffffd55, 0x00001fff @ atan 2^-17 Q62
.word 0xffffffab, 0x00000fff @ atan 2^-18 Q62
.word 0xfffffff5, 0x000007ff @ atan 2^-19 Q62
.word 0xffffffff, 0x000003ff @ atan 2^-20 Q62
.word 0x00000000, 0x00000200 @ atan 2^-21 Q62 @ consider optimising these
.word 0x00000000, 0x00000100 @ atan 2^-22 Q62
.word 0x00000000, 0x00000080 @ atan 2^-23 Q62
.word 0x00000000, 0x00000040 @ atan 2^-24 Q62
.word 0x00000000, 0x00000020 @ atan 2^-25 Q62
.word 0x00000000, 0x00000010 @ atan 2^-26 Q62
.word 0x00000000, 0x00000008 @ atan 2^-27 Q62
.word 0x00000000, 0x00000004 @ atan 2^-28 Q62
.word 0x00000000, 0x00000002 @ atan 2^-29 Q62
.word 0x00000000, 0x00000001 @ atan 2^-30 Q62
.word 0x80000000, 0x00000000 @ atan 2^-31 Q62
.word 0x40000000, 0x00000000 @ atan 2^-32 Q62
.thumb_func
qfp_dexp:
push {r4-r7,r14}
bl dunpacks
adr r4,dreddata1
bl dreduce
cmp r1,#0
bge 1f
ldr r4,=#0xF473DE6B
ldr r5,=#0x2C5C85FD @ ln2 Q62
adds r0,r4
adcs r1,r5
subs r2,#1
1:
push {r2}
movs r7,#1 @ shift
adr r6,dtab_exp
movs r2,#0
movs r3,#1
lsls r3,#30 @ x=1 Q62
3:
ldmia r6!,{r4,r5}
mov r12,r6
subs r0,r4
sbcs r1,r5
bmi 1f
rsbs r6,r7,#0
adds r6,#32 @ complementary shift
movs r5,r3
asrs r5,r7
movs r4,r3
lsls r4,r6
movs r6,r2
lsrs r6,r7 @ rounding bit in carry
orrs r4,r6
adcs r2,r4
adcs r3,r5 @ x+=x>>i
b 2f
1:
adds r0,r4 @ restore argument
adcs r1,r5
2:
mov r6,r12
adds r7,#1
cmp r7,#33
bne 3b
@ here
@ r0:r1 ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)
@ r2:r3 exp a Q62
@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1
@ r4:r1 ε exp a Q(62+62-32)=Q92
lsrs r4,#30
lsls r0,r1,#2
orrs r0,r4
asrs r1,#30
adds r0,r2
adcs r1,r3
pop {r2}
rsbs r2,#0
adds r2,#62
bl qfp_fix642double @ in principle we can pack faster than this because we know the exponent
pop {r4-r7,r15}
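@
@ Illustrative note (not part of the library): a hedged C model of the
@ pseudo-division above; each ln(1+2^-i) that can be subtracted from the
@ reduced argument multiplies the accumulator by (1+2^-i), a shift-and-add:
@
@   #include <stdint.h>
@   extern const int64_t ln_tab[32];        // ln(1+2^-i), i=1..32, Q62 (dtab_exp)
@   static int64_t exp_core(int64_t x) {    // x Q62, 0 <= x < ln2
@       int64_t y = (int64_t)1 << 62;       // 1.0 Q62
@       for (int i = 1; i <= 32; i++)
@           if (x >= ln_tab[i-1]) { x -= ln_tab[i-1]; y += y >> i; }
@       return y + (((y >> 32) * x) >> 30); // final (1+ε) step, ε = residual x
@   }
@   // exp(v) = exp_core(v - q*ln2) * 2^q after the dreduce range reduction
@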
.ltorg
.thumb_func
qfp_dln:
push {r4-r7,r14}
lsls r7,r1,#1
bcs 5f @ <0 ...
asrs r7,#21
beq 5f @ ... or =0? return -Inf
adds r7,#1
beq 6f @ Inf/NaN? return +Inf
bl dunpacks
push {r2}
lsls r1,#9
lsrs r2,r0,#23
orrs r1,r2
lsls r0,#9
@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1
movs r7,#1 @ shift
adr r6,dtab_exp
mov r12,r6
movs r2,#0
movs r3,#0 @ y=0 Q62
3:
rsbs r6,r7,#0
adds r6,#32 @ complementary shift
movs r5,r1
asrs r5,r7
movs r4,r1
lsls r4,r6
movs r6,r0
lsrs r6,r7
orrs r4,r6 @ x>>i, rounding bit in carry
adcs r4,r0
adcs r5,r1 @ x+(x>>i)
lsrs r6,r5,#30
bne 1f @ x+(x>>i)>1?
movs r0,r4
movs r1,r5 @ x+=x>>i
mov r6,r12
ldmia r6!,{r4,r5}
subs r2,r4
sbcs r3,r5
1:
movs r4,#8
add r12,r4
adds r7,#1
cmp r7,#33
bne 3b
@ here:
@ r0:r1 residual x, nearly 1 Q62
@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62
@ result is y + ln2 + ln x ~ y + ln2 + (x-1)
lsls r1,#2
asrs r1,#2 @ x-1
adds r2,r0
adcs r3,r1
pop {r7}
@ here:
@ r2:r3 ln m/2 = ln m - ln2 Q62
@ r7 unbiased exponent
adr r4,dreddata1+4
ldmia r4,{r0,r1,r4}
adds r7,#1
muls r0,r7 @ Q62
muls r1,r7 @ Q41
muls r4,r7 @ Q20
lsls r7,r1,#21
asrs r1,#11
asrs r5,r1,#31
adds r0,r7
adcs r1,r5
lsls r7,r4,#10
asrs r4,#22
asrs r5,r1,#31
adds r1,r7
adcs r4,r5
@ r0:r1:r4 exponent*ln2 Q62
asrs r5,r3,#31
adds r0,r2
adcs r1,r3
adcs r4,r5
@ r0:r1:r4 result Q62
movs r2,#62
1:
asrs r5,r1,#31
cmp r4,r5
beq 2f @ r4 a sign extension of r1?
lsrs r0,#4 @ no: shift down 4 places and try again
lsls r6,r1,#28
orrs r0,r6
lsrs r1,#4
lsls r6,r4,#28
orrs r1,r6
asrs r4,#4
subs r2,#4
b 1b
2:
bl qfp_fix642double
pop {r4-r7,r15}
5:
ldr r1,=#0xfff00000
movs r0,#0
pop {r4-r7,r15}
6:
ldr r1,=#0x7ff00000
movs r0,#0
pop {r4-r7,r15}
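@
@ Illustrative note (not part of the library): a hedged C model of the
@ multiplicative normalisation above; m is multiplied by factors (1+2^-i)
@ while the product stays below 1, accumulating -ln(1+2^-i):
@
@   #include <stdint.h>
@   extern const int64_t ln_tab[32];        // ln(1+2^-i), i=1..32, Q62 (dtab_exp)
@   static int64_t ln_core(int64_t m) {     // m Q62, 0.5 <= m < 1
@       int64_t y = 0;
@       for (int i = 1; i <= 32; i++) {
@           int64_t t = m + (m >> i);       // candidate m*(1+2^-i)
@           if (t < ((int64_t)1 << 62)) { m = t; y -= ln_tab[i-1]; }
@       }
@       return y + (m - ((int64_t)1 << 62));   // + (x-1) ~ ln x residual
@   }
@   // ln(v) = ln_core(mant/2) + (e+1)*ln2, where v = mant*2^e, 1 <= mant < 2
@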
.ltorg
.align 2
dreddata1:
.word 0x0000B8AA @ 1/ln2 Q15
.word 0x0013DE6B @ ln2 Q62=2C5C85FDF473DE6B split into 21-bit pieces
.word 0x000FEFA3
.word 0x000B1721
dtab_exp:
.word 0xbf984bf3, 0x19f323ec @ log 1+2^-1 Q62
.word 0xcd4d10d6, 0x0e47fbe3 @ log 1+2^-2 Q62
.word 0x8abcb97a, 0x0789c1db @ log 1+2^-3 Q62
.word 0x022c54cc, 0x03e14618 @ log 1+2^-4 Q62
.word 0xe7833005, 0x01f829b0 @ log 1+2^-5 Q62
.word 0x87e01f1e, 0x00fe0545 @ log 1+2^-6 Q62
.word 0xac419e24, 0x007f80a9 @ log 1+2^-7 Q62
.word 0x45621781, 0x003fe015 @ log 1+2^-8 Q62
.word 0xa9ab10e6, 0x001ff802 @ log 1+2^-9 Q62
.word 0x55455888, 0x000ffe00 @ log 1+2^-10 Q62
.word 0x0aa9aac4, 0x0007ff80 @ log 1+2^-11 Q62
.word 0x01554556, 0x0003ffe0 @ log 1+2^-12 Q62
.word 0x002aa9ab, 0x0001fff8 @ log 1+2^-13 Q62
.word 0x00055545, 0x0000fffe @ log 1+2^-14 Q62
.word 0x8000aaaa, 0x00007fff @ log 1+2^-15 Q62
.word 0xe0001555, 0x00003fff @ log 1+2^-16 Q62
.word 0xf80002ab, 0x00001fff @ log 1+2^-17 Q62
.word 0xfe000055, 0x00000fff @ log 1+2^-18 Q62
.word 0xff80000b, 0x000007ff @ log 1+2^-19 Q62
.word 0xffe00001, 0x000003ff @ log 1+2^-20 Q62
.word 0xfff80000, 0x000001ff @ log 1+2^-21 Q62
.word 0xfffe0000, 0x000000ff @ log 1+2^-22 Q62
.word 0xffff8000, 0x0000007f @ log 1+2^-23 Q62
.word 0xffffe000, 0x0000003f @ log 1+2^-24 Q62
.word 0xfffff800, 0x0000001f @ log 1+2^-25 Q62
.word 0xfffffe00, 0x0000000f @ log 1+2^-26 Q62
.word 0xffffff80, 0x00000007 @ log 1+2^-27 Q62
.word 0xffffffe0, 0x00000003 @ log 1+2^-28 Q62
.word 0xfffffff8, 0x00000001 @ log 1+2^-29 Q62
.word 0xfffffffe, 0x00000000 @ log 1+2^-30 Q62
.word 0x80000000, 0x00000000 @ log 1+2^-31 Q62
.word 0x40000000, 0x00000000 @ log 1+2^-32 Q62
qfp_lib_end: