|
|
@ Copyright 2019-2024 Mark Owen
|
|
|
@ http://www.quinapalus.com
|
|
|
@ E-mail: qfp@quinapalus.com
|
|
|
@
|
|
|
@ This file is free software: you can redistribute it and/or modify
|
|
|
@ it under the terms of version 2 of the GNU General Public License
|
|
|
@ as published by the Free Software Foundation.
|
|
|
@
|
|
|
@ This file is distributed in the hope that it will be useful,
|
|
|
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
@ GNU General Public License for more details.
|
|
|
@
|
|
|
@ You should have received a copy of the GNU General Public License
|
|
|
@ along with this file. If not, see <http://www.gnu.org/licenses/> or
|
|
|
@ write to the Free Software Foundation, Inc., 51 Franklin Street,
|
|
|
@ Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
|
.syntax unified
|
|
|
.cpu cortex-m0plus
|
|
|
.thumb
|
|
|
|
|
|
@ exported symbols
|
|
|
|
|
|
.global qfp_fadd
|
|
|
.global qfp_fsub
|
|
|
.global qfp_fmul
|
|
|
.global qfp_fdiv
|
|
|
.global qfp_fcmp
|
|
|
.global qfp_fsqrt
|
|
|
.global qfp_float2int
|
|
|
.global qfp_float2fix
|
|
|
.global qfp_float2uint
|
|
|
.global qfp_float2ufix
|
|
|
.global qfp_int2float
|
|
|
.global qfp_fix2float
|
|
|
.global qfp_uint2float
|
|
|
.global qfp_ufix2float
|
|
|
.global qfp_int642float
|
|
|
.global qfp_fix642float
|
|
|
.global qfp_uint642float
|
|
|
.global qfp_ufix642float
|
|
|
.global qfp_fcos
|
|
|
.global qfp_fsin
|
|
|
.global qfp_ftan
|
|
|
.global qfp_fatan2
|
|
|
.global qfp_fexp
|
|
|
.global qfp_fln
|
|
|
|
|
|
.global qfp_dadd
|
|
|
.global qfp_dsub
|
|
|
.global qfp_dmul
|
|
|
.global qfp_ddiv
|
|
|
.global qfp_dsqrt
|
|
|
.global qfp_dcos
|
|
|
.global qfp_dsin
|
|
|
.global qfp_dtan
|
|
|
.global qfp_datan2
|
|
|
.global qfp_dexp
|
|
|
.global qfp_dln
|
|
|
.global qfp_dcmp
|
|
|
|
|
|
.global qfp_float2int64
|
|
|
.global qfp_float2fix64
|
|
|
.global qfp_float2uint64
|
|
|
.global qfp_float2ufix64
|
|
|
.global qfp_float2int_z
|
|
|
.global qfp_float2int64_z
|
|
|
|
|
|
.global qfp_double2int
|
|
|
.global qfp_double2fix
|
|
|
.global qfp_double2uint
|
|
|
.global qfp_double2ufix
|
|
|
.global qfp_double2int64
|
|
|
.global qfp_double2fix64
|
|
|
.global qfp_double2uint64
|
|
|
.global qfp_double2ufix64
|
|
|
.global qfp_double2int_z
|
|
|
.global qfp_double2int64_z
|
|
|
|
|
|
.global qfp_int2double
|
|
|
.global qfp_fix2double
|
|
|
.global qfp_uint2double
|
|
|
.global qfp_ufix2double
|
|
|
.global qfp_int642double
|
|
|
.global qfp_fix642double
|
|
|
.global qfp_uint642double
|
|
|
.global qfp_ufix642double
|
|
|
|
|
|
.global qfp_double2float
|
|
|
.global qfp_float2double
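@ For reference, these single-precision entry points are normally called from C with
@ declarations along the following lines. This is an illustrative sketch only -- the
@ header shipped with the library is authoritative:
@
@   float    qfp_fadd(float x, float y);
@   float    qfp_fsub(float x, float y);
@   float    qfp_fmul(float x, float y);
@   float    qfp_fdiv(float x, float y);
@   int      qfp_fcmp(float x, float y);     // returns -1, 0 or +1
@   float    qfp_fsqrt(float x);
@   int      qfp_float2int(float x);         // rounds towards -Inf
@   int      qfp_float2fix(float x, int f);  // f = bits after the binary point
@   float    qfp_int2float(int x);
@   float    qfp_fix2float(int x, int f);
@   float    qfp_fsin(float x);
@   float    qfp_fcos(float x);
@   float    qfp_fatan2(float y, float x);
@   float    qfp_fexp(float x);
@   float    qfp_fln(float x);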
|
|
|
|
|
|
qfp_lib_start:
|
|
|
|
|
|
@ exchange r0<->r1, r2<->r3
|
|
|
xchxy:
|
|
|
push {r0,r2,r14}
|
|
|
mov r0,r1
|
|
|
mov r2,r3
|
|
|
pop {r1,r3,r15}
|
|
|
|
|
|
@ IEEE single in r0-> signed (two's complement) mantissa in r0 9Q23 (24 significant bits), signed exponent (bias removed) in r2
|
|
|
@ trashes r4; zero, denormal -> mantissa=+/-1, exponent=-380; Inf, NaN -> mantissa=+/-1, exponent=+640
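@ As an illustrative C sketch (not part of the library, using <stdint.h> types and
@ assuming arithmetic right shifts), unpackx behaves like:
@
@   int32_t unpackx_sketch(uint32_t f, int32_t *e) {   // returns signed 9Q23 mantissa
@     int32_t m  = (f & 0x007fffff) | 0x00800000;      // insert implied leading 1
@     int32_t ex = (f >> 23) & 0xff;                   // biased exponent
@     if (f >> 31) m = -m;                             // apply sign
@     if (ex ==   0) { *e = -380; return m < 0 ? -1 : 1; }  // zero/denormal
@     if (ex == 255) { *e =  640; return m < 0 ? -1 : 1; }  // Inf/NaN
@     *e = ex - 127;                                   // remove bias
@     return m;
@   }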
|
|
|
unpackx:
|
|
|
lsrs r2,r0,#23 @ save exponent and sign
|
|
|
lsls r0,#9 @ extract mantissa
|
|
|
lsrs r0,#9
|
|
|
movs r4,#1
|
|
|
lsls r4,#23
|
|
|
orrs r0,r4 @ reinstate implied leading 1
|
|
|
cmp r2,#255 @ test sign bit
|
|
|
uxtb r2,r2 @ clear it
|
|
|
bls 1f @ branch on positive
|
|
|
rsbs r0,#0 @ negate mantissa
|
|
|
1:
|
|
|
subs r2,#1
|
|
|
cmp r2,#254 @ zero/denormal/Inf/NaN?
|
|
|
bhs 2f
|
|
|
subs r2,#126 @ remove exponent bias: can now be -126..+127
|
|
|
bx r14
|
|
|
|
|
|
2: @ here with special-case values
|
|
|
cmp r0,#0
|
|
|
mov r0,r4 @ set mantissa to +1
|
|
|
bpl 3f
|
|
|
rsbs r0,#0 @ zero/denormal/Inf/NaN: mantissa=+/-1
|
|
|
3:
|
|
|
subs r2,#126 @ zero/denormal: exponent -> -127; Inf, NaN: exponent -> 128
|
|
|
lsls r2,#2 @ zero/denormal: exponent -> -508; Inf, NaN: exponent -> 512
|
|
|
adds r2,#128 @ zero/denormal: exponent -> -380; Inf, NaN: exponent -> 640
|
|
|
bx r14
|
|
|
|
|
|
@ normalise and pack signed mantissa in r0 nominally 3Q29, signed exponent in r2-> IEEE single in r0
|
|
|
@ trashes r4, preserves r1,r3
|
|
|
@ r5: "sticky bits", must be zero iff all result bits below r0 are zero for correct rounding
|
|
|
packx:
|
|
|
lsrs r4,r0,#31 @ save sign bit
|
|
|
lsls r4,r4,#31 @ sign now in b31
|
|
|
bpl 2f @ skip if positive
|
|
|
cmp r5,#0
|
|
|
beq 11f
|
|
|
adds r0,#1 @ fiddle carry in to following rsb if sticky bits are non-zero
|
|
|
11:
|
|
|
rsbs r0,#0 @ can now treat r0 as unsigned
|
|
|
packx0:
|
|
|
bmi 3f @ catch r0=0x80000000 case
|
|
|
2:
|
|
|
subs r2,#1 @ normalisation loop
|
|
|
adds r0,r0
|
|
|
beq 1f @ zero? special case
|
|
|
bpl 2b @ normalise so leading "1" in bit 31
|
|
|
3:
|
|
|
adds r2,#129 @ (mis-)offset exponent
|
|
|
bne 12f @ special case: highest denormal can round to lowest normal
|
|
|
adds r0,#0x80 @ in special case, need to add 256 to r0 for rounding
|
|
|
bcs 4f @ tripped carry? then have leading 1 in C as required
|
|
|
12:
|
|
|
adds r0,#0x80 @ rounding
|
|
|
bcs 4f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
|
|
|
cmp r5,#0
|
|
|
beq 7f @ sticky bits zero?
|
|
|
8:
|
|
|
lsls r0,#1 @ remove leading 1
|
|
|
9:
|
|
|
subs r2,#1 @ compensate exponent on this path
|
|
|
4:
|
|
|
cmp r2,#254
|
|
|
bge 5f @ overflow?
|
|
|
adds r2,#1 @ correct exponent offset
|
|
|
ble 10f @ denormal/underflow?
|
|
|
lsrs r0,#9 @ align mantissa
|
|
|
lsls r2,#23 @ align exponent
|
|
|
orrs r0,r2 @ assemble exponent and mantissa
|
|
|
6:
|
|
|
orrs r0,r4 @ apply sign
|
|
|
1:
|
|
|
bx r14
|
|
|
|
|
|
5:
|
|
|
movs r0,#0xff @ create infinity
|
|
|
lsls r0,#23
|
|
|
b 6b
|
|
|
|
|
|
10:
|
|
|
movs r0,#0 @ create zero
|
|
|
bx r14
|
|
|
|
|
|
7: @ sticky bit rounding case
|
|
|
lsls r5,r0,#24 @ check bottom 8 bits of r0
|
|
|
bne 8b @ in rounding-tie case?
|
|
|
lsrs r0,#9 @ ensure even result
|
|
|
lsls r0,#10
|
|
|
b 9b
|
|
|
|
|
|
.align 2
|
|
|
.ltorg
|
|
|
|
|
|
@ signed multiply r0 1Q23 by r1 4Q23, result in r0 7Q25, sticky bits in r5
|
|
|
@ trashes r3,r4
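@ As an illustrative C sketch (not the actual code, assuming arithmetic right shifts):
@ the 32x32-bit product is assembled from 16x16-bit MULS results, and everything
@ discarded below Q25 is ORed into the sticky bits.
@
@   int32_t mul0_sketch(int32_t x, int32_t y, uint32_t *sticky) {
@     int32_t  mid = (x & 0xffff) * (y >> 16)               // L*H, Q30
@                  + (x >> 16) * (y & 0xffff);              // H*L, Q30
@     uint32_t low = (uint32_t)(x & 0xffff) * (y & 0xffff); // L*L, Q46
@     mid += (int32_t)(low >> 16);                          // high half of L*L, Q30
@     *sticky = (low << 16) | ((uint32_t)mid << 27);        // bits below Q25
@     return (((x >> 16) * (y >> 16)) << 11) + (mid >> 5);  // H*H Q25 + middle Q25
@   }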
|
|
|
mul0:
|
|
|
uxth r3,r0 @ Q23
|
|
|
asrs r4,r1,#16 @ Q7
|
|
|
muls r3,r4 @ L*H, Q30 signed
|
|
|
asrs r4,r0,#16 @ Q7
|
|
|
uxth r5,r1 @ Q23
|
|
|
muls r4,r5 @ H*L, Q30 signed
|
|
|
adds r3,r4 @ sum of middle partial products
|
|
|
uxth r4,r0
|
|
|
muls r4,r5 @ L*L, Q46 unsigned
|
|
|
lsls r5,r4,#16 @ initialise sticky bits from low half of low partial product
|
|
|
lsrs r4,#16 @ Q30
|
|
|
adds r3,r4 @ add high half of low partial product to sum of middle partial products
|
|
|
@ (cannot generate carry by limits on input arguments)
|
|
|
asrs r0,#16 @ Q7
|
|
|
asrs r1,#16 @ Q7
|
|
|
muls r0,r1 @ H*H, Q14 signed
|
|
|
lsls r0,#11 @ high partial product Q25
|
|
|
lsls r1,r3,#27 @ sticky
|
|
|
orrs r5,r1 @ collect further sticky bits
|
|
|
asrs r1,r3,#5 @ middle partial products Q25
|
|
|
adds r0,r1 @ final result
|
|
|
bx r14
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_fcmp:
|
|
|
lsls r2,r0,#1
|
|
|
lsrs r2,#24
|
|
|
beq 1f
|
|
|
cmp r2,#0xff
|
|
|
bne 2f
|
|
|
1:
|
|
|
lsrs r0,#23 @ clear mantissa if NaN or denormal
|
|
|
lsls r0,#23
|
|
|
2:
|
|
|
lsls r2,r1,#1
|
|
|
lsrs r2,#24
|
|
|
beq 1f
|
|
|
cmp r2,#0xff
|
|
|
bne 2f
|
|
|
1:
|
|
|
lsrs r1,#23 @ clear mantissa if NaN or denormal
|
|
|
lsls r1,#23
|
|
|
2:
|
|
|
movs r2,#1 @ initialise result
|
|
|
eors r1,r0
|
|
|
bmi 4f @ opposite signs? then can proceed on basis of sign of x
|
|
|
eors r1,r0 @ restore y
|
|
|
bpl 1f
|
|
|
rsbs r2,#0 @ both negative? flip comparison
|
|
|
1:
|
|
|
cmp r0,r1
|
|
|
bgt 2f
|
|
|
blt 3f
|
|
|
5:
|
|
|
movs r2,#0
|
|
|
3:
|
|
|
rsbs r2,#0
|
|
|
2:
|
|
|
subs r0,r2,#0
|
|
|
bx r14
|
|
|
4:
|
|
|
orrs r1,r0
|
|
|
adds r1,r1
|
|
|
beq 5b
|
|
|
cmp r0,#0
|
|
|
bge 2b
|
|
|
b 3b
|
|
|
|
|
|
@ convert float to signed int, rounding towards 0, clamping
|
|
|
.thumb_func
|
|
|
qfp_float2int_z:
|
|
|
push {r14}
|
|
|
cmp r0,#0
|
|
|
blt 1f
|
|
|
bl qfp_float2int @ +ve or zero? then use rounding towards -Inf
|
|
|
cmp r0,#0
|
|
|
bge 2f
|
|
|
ldr r0,=#0x7fffffff
|
|
|
2:
|
|
|
pop {r15}
|
|
|
1:
|
|
|
lsls r0,#1 @ -ve: clear sign bit
|
|
|
lsrs r0,#1
|
|
|
bl qfp_float2uint @ convert to unsigned, rounding towards -Inf
|
|
|
cmp r0,#0
|
|
|
blt 1f
|
|
|
rsbs r0,#0
|
|
|
pop {r15}
|
|
|
1:
|
|
|
movs r0,#1
|
|
|
lsls r0,#31 @ 0x80000000
|
|
|
pop {r15}
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
@ convert float to signed int, rounding towards -Inf, clamping
|
|
|
.thumb_func
|
|
|
qfp_float2int:
|
|
|
movs r1,#0 @ fall through
|
|
|
|
|
|
@ convert float in r0 to signed fixed point in r0, clamping
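@ Illustrative usage, assuming the usual C declaration int qfp_float2fix(float x, int f),
@ where f is the number of bits after the binary point:
@
@   int q16a = qfp_float2fix( 1.5f , 16);   // ->  0x00018000 (1.5 in Q16)
@   int q16b = qfp_float2fix(-1.25f, 16);   // -> -0x00014000 (exact, so no rounding)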
|
|
|
.thumb_func
|
|
|
qfp_float2fix:
|
|
|
push {r4,r14}
|
|
|
bl unpackx
|
|
|
movs r3,r2
|
|
|
adds r3,#130
|
|
|
bmi 6f @ -0?
|
|
|
add r2,r1 @ incorporate binary point position into exponent
|
|
|
subs r2,#23 @ r2 is now amount of left shift required
|
|
|
blt 1f @ requires right shift?
|
|
|
cmp r2,#7 @ overflow?
|
|
|
ble 4f
|
|
|
3: @ overflow
|
|
|
asrs r1,r0,#31 @ +ve:0 -ve:0xffffffff
|
|
|
mvns r1,r1 @ +ve:0xffffffff -ve:0
|
|
|
movs r0,#1
|
|
|
lsls r0,#31
|
|
|
5:
|
|
|
eors r0,r1 @ +ve:0x7fffffff -ve:0x80000000 (unsigned path: 0xffffffff)
|
|
|
pop {r4,r15}
|
|
|
1:
|
|
|
rsbs r2,#0 @ right shift for r0, >0
|
|
|
cmp r2,#32
|
|
|
blt 2f @ more than 32 bits of right shift?
|
|
|
movs r2,#32
|
|
|
2:
|
|
|
asrs r0,r0,r2
|
|
|
pop {r4,r15}
|
|
|
6:
|
|
|
movs r0,#0
|
|
|
pop {r4,r15}
|
|
|
|
|
|
@ unsigned version
|
|
|
.thumb_func
|
|
|
qfp_float2uint:
|
|
|
movs r1,#0 @ fall through
|
|
|
.thumb_func
|
|
|
qfp_float2ufix:
|
|
|
push {r4,r14}
|
|
|
bl unpackx
|
|
|
add r2,r1 @ incorporate binary point position into exponent
|
|
|
movs r1,r0
|
|
|
bmi 5b @ negative? return zero
|
|
|
subs r2,#23 @ r2 is now amount of left shift required
|
|
|
blt 1b @ requires right shift?
|
|
|
mvns r1,r0 @ ready to return 0xffffffff
|
|
|
cmp r2,#8 @ overflow?
|
|
|
bgt 5b
|
|
|
4:
|
|
|
lsls r0,r0,r2 @ result fits, left shifted
|
|
|
pop {r4,r15}
|
|
|
|
|
|
|
|
|
@ convert uint64 to float, rounding
|
|
|
.thumb_func
|
|
|
qfp_uint642float:
|
|
|
movs r2,#0 @ fall through
|
|
|
|
|
|
@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
|
|
|
.thumb_func
|
|
|
qfp_ufix642float:
|
|
|
push {r4,r5,r14}
|
|
|
cmp r1,#0
|
|
|
bpl 3f @ positive? we can use signed code
|
|
|
lsls r5,r1,#31 @ contribution to sticky bits
|
|
|
orrs r5,r0
|
|
|
lsrs r0,r1,#1
|
|
|
subs r2,#1
|
|
|
b 4f
|
|
|
|
|
|
@ convert int64 to float, rounding
|
|
|
.thumb_func
|
|
|
qfp_int642float:
|
|
|
movs r2,#0 @ fall through
|
|
|
|
|
|
@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
|
|
|
.thumb_func
|
|
|
qfp_fix642float:
|
|
|
push {r4,r5,r14}
|
|
|
3:
|
|
|
movs r5,r0
|
|
|
orrs r5,r1
|
|
|
beq ret_pop45 @ zero? return +0
|
|
|
asrs r5,r1,#31 @ sign bits
|
|
|
2:
|
|
|
asrs r4,r1,#24 @ try shifting 7 bits at a time
|
|
|
cmp r4,r5
|
|
|
bne 1f @ next shift will overflow?
|
|
|
lsls r1,#7
|
|
|
lsrs r4,r0,#25
|
|
|
orrs r1,r4
|
|
|
lsls r0,#7
|
|
|
adds r2,#7
|
|
|
b 2b
|
|
|
1:
|
|
|
movs r5,r0
|
|
|
movs r0,r1
|
|
|
4:
|
|
|
rsbs r2,#0
|
|
|
adds r2,#32+29
|
|
|
b packret
|
|
|
|
|
|
@ convert signed int to float, rounding
|
|
|
.thumb_func
|
|
|
qfp_int2float:
|
|
|
movs r1,#0 @ fall through
|
|
|
|
|
|
@ convert signed fix to float, rounding; number of r0 bits after point in r1
|
|
|
.thumb_func
|
|
|
qfp_fix2float:
|
|
|
push {r4,r5,r14}
|
|
|
1:
|
|
|
movs r2,#29
|
|
|
subs r2,r1 @ fix exponent
|
|
|
packretns: @ pack and return, sticky bits=0
|
|
|
movs r5,#0
|
|
|
packret: @ common return point: "pack and return"
|
|
|
bl packx
|
|
|
ret_pop45:
|
|
|
pop {r4,r5,r15}
|
|
|
|
|
|
|
|
|
@ unsigned version
|
|
|
.thumb_func
|
|
|
qfp_uint2float:
|
|
|
movs r1,#0 @ fall through
|
|
|
.thumb_func
|
|
|
qfp_ufix2float:
|
|
|
push {r4,r5,r14}
|
|
|
cmp r0,#0
|
|
|
bge 1b @ treat <2^31 as signed
|
|
|
movs r2,#30
|
|
|
subs r2,r1 @ fix exponent
|
|
|
lsls r5,r0,#31 @ one sticky bit
|
|
|
lsrs r0,#1
|
|
|
b packret
|
|
|
|
|
|
@ All the scientific functions are implemented using the CORDIC algorithm. For notation,
|
|
|
@ details not explained in the comments below, and a good overall survey see
|
|
|
@ "50 Years of CORDIC: Algorithms, Architectures, and Applications" by Meher et al.,
|
|
|
@ IEEE Transactions on Circuits and Systems Part I, Volume 56 Issue 9.
|
|
|
|
|
|
@ Register use:
|
|
|
@ r0: x
|
|
|
@ r1: y
|
|
|
@ r2: z/omega
|
|
|
@ r3: coefficient pointer
|
|
|
@ r4,r12: m
|
|
|
@ r5: i (shift)
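@ As an illustrative C sketch (not the actual code), one rotation-mode step of the
@ circular (m=1) CORDIC looks like this, with atan_tab[i]=atan(2^-i) in the same
@ fixed-point scale as z; the well-known CORDIC gain (~1.6468) is compensated by
@ pre-scaling the initial vector:
@
@   void cordic_rstep_sketch(int32_t *x, int32_t *y, int32_t *z,
@                            const int32_t *atan_tab, int i) {
@     int32_t dx = *y >> i, dy = *x >> i;                     // shifted copies of old x,y
@     if (*z >= 0) { *x -= dx; *y += dy; *z -= atan_tab[i]; } // rotate towards z==0
@     else         { *x += dx; *y -= dy; *z += atan_tab[i]; }
@   }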
|
|
|
|
|
|
cordic_start: @ initialisation
|
|
|
movs r5,#0 @ initial shift=0
|
|
|
mov r12,r4
|
|
|
b 5f
|
|
|
|
|
|
cordic_vstep: @ one step of algorithm in vector mode
|
|
|
cmp r1,#0 @ check sign of y
|
|
|
bgt 4f
|
|
|
b 1f
|
|
|
cordic_rstep: @ one step of algorithm in rotation mode
|
|
|
cmp r2,#0 @ check sign of angle
|
|
|
bge 1f
|
|
|
4:
|
|
|
subs r1,r6 @ negative rotation: y=y-(x>>i)
|
|
|
rsbs r7,#0
|
|
|
adds r2,r4 @ accumulate angle
|
|
|
b 2f
|
|
|
1:
|
|
|
adds r1,r6 @ positive rotation: y=y+(x>>i)
|
|
|
subs r2,r4 @ accumulate angle
|
|
|
2:
|
|
|
mov r4,r12
|
|
|
muls r7,r4 @ apply sign from m
|
|
|
subs r0,r7 @ finish rotation: x=x{+/-}(y>>i)
|
|
|
5:
|
|
|
ldmia r3!,{r4} @ fetch next angle from table and bump pointer
|
|
|
lsrs r4,#1 @ repeated angle?
|
|
|
bcs 3f
|
|
|
adds r5,#1 @ adjust shift if not
|
|
|
3:
|
|
|
mov r6,r0
|
|
|
asrs r6,r5 @ x>>i
|
|
|
mov r7,r1
|
|
|
asrs r7,r5 @ y>>i
|
|
|
lsrs r4,#1 @ shift end flag into carry
|
|
|
bx r14
|
|
|
|
|
|
@ CORDIC rotation mode
|
|
|
cordic_rot:
|
|
|
push {r6,r7,r14}
|
|
|
bl cordic_start @ initialise
|
|
|
1:
|
|
|
bl cordic_rstep
|
|
|
bcc 1b @ step until table finished
|
|
|
asrs r6,r0,#14 @ remaining small rotations can be linearised: see IV.B of paper referenced above
|
|
|
asrs r7,r1,#14
|
|
|
asrs r2,#3
|
|
|
muls r6,r2 @ all remaining CORDIC steps in a multiplication
|
|
|
muls r7,r2
|
|
|
mov r4,r12
|
|
|
muls r7,r4
|
|
|
asrs r6,#12
|
|
|
asrs r7,#12
|
|
|
subs r0,r7 @ x=x{+/-}(yz>>k)
|
|
|
adds r1,r6 @ y=y+(xz>>k)
|
|
|
cordic_exit:
|
|
|
pop {r6,r7,r15}
|
|
|
|
|
|
@ CORDIC vector mode
|
|
|
cordic_vec:
|
|
|
push {r6,r7,r14}
|
|
|
bl cordic_start @ initialise
|
|
|
1:
|
|
|
bl cordic_vstep
|
|
|
bcc 1b @ step until table finished
|
|
|
4:
|
|
|
cmp r1,#0 @ continue as in cordic_vstep but without using table; x is not affected as y is small
|
|
|
bgt 2f @ check sign of y
|
|
|
adds r1,r6 @ positive rotation: y=y+(x>>i)
|
|
|
subs r2,r4 @ accumulate angle
|
|
|
b 3f
|
|
|
2:
|
|
|
subs r1,r6 @ negative rotation: y=y-(x>>i)
|
|
|
adds r2,r4 @ accumulate angle
|
|
|
3:
|
|
|
asrs r6,#1
|
|
|
asrs r4,#1 @ next "table entry"
|
|
|
bne 4b
|
|
|
b cordic_exit
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_fsin: @ calculate sin and cos using CORDIC rotation method
|
|
|
push {r4,r5,r14}
|
|
|
movs r1,#24
|
|
|
bl qfp_float2fix @ range reduction by repeated subtraction/addition in fixed point
|
|
|
ldr r4,pi_q29
|
|
|
lsrs r4,#4 @ 2pi Q24
|
|
|
1:
|
|
|
subs r0,r4
|
|
|
bge 1b
|
|
|
1:
|
|
|
adds r0,r4
|
|
|
bmi 1b @ now in range 0..2pi
|
|
|
lsls r2,r0,#2 @ z Q26
|
|
|
lsls r5,r4,#1 @ pi Q26 (r4=pi/2 Q26)
|
|
|
ldr r0,=#0x136e9db4 @ initialise CORDIC x,y with scaling
|
|
|
movs r1,#0
|
|
|
1:
|
|
|
cmp r2,r4 @ >pi/2?
|
|
|
blt 2f
|
|
|
subs r2,r5 @ reduce range to -pi/2..pi/2
|
|
|
rsbs r0,#0 @ rotate vector by pi
|
|
|
b 1b
|
|
|
2:
|
|
|
lsls r2,#3 @ Q29
|
|
|
adr r3,tab_cc @ circular coefficients
|
|
|
movs r4,#1 @ m=1
|
|
|
bl cordic_rot
|
|
|
adds r1,#9 @ fiddle factor to make sin(0)==0
|
|
|
movs r2,#0 @ exponents to zero
|
|
|
movs r3,#0
|
|
|
movs r5,#0 @ no sticky bits
|
|
|
bl clampx
|
|
|
bl packx @ pack cosine
|
|
|
bl xchxy
|
|
|
bl clampx
|
|
|
b packretns @ pack sine
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_fcos:
|
|
|
push {r14}
|
|
|
bl qfp_fsin
|
|
|
mov r0,r1 @ extract cosine result
|
|
|
pop {r15}
|
|
|
|
|
|
@ force r0 to lie in range [-1,1] Q29
|
|
|
clampx:
|
|
|
movs r4,#1
|
|
|
lsls r4,#29
|
|
|
cmp r0,r4
|
|
|
bgt 1f
|
|
|
rsbs r4,#0
|
|
|
cmp r0,r4
|
|
|
ble 1f
|
|
|
bx r14
|
|
|
1:
|
|
|
movs r0,r4
|
|
|
bx r14
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_ftan:
|
|
|
push {r4,r5,r6,r14}
|
|
|
bl qfp_fsin @ sine in r0/r2, cosine in r1/r3
|
|
|
b fdiv_n @ sin/cos
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_fexp:
|
|
|
push {r4,r5,r14}
|
|
|
movs r1,#24
|
|
|
bl qfp_float2fix @ Q24: covers entire valid input range
|
|
|
asrs r1,r0,#16 @ Q8
|
|
|
ldr r2,=#5909 @ log_2(e) Q12
|
|
|
muls r2,r1 @ estimate exponent of result Q20 (always an underestimate)
|
|
|
asrs r2,#20 @ Q0
|
|
|
lsls r1,r0,#6 @ Q30
|
|
|
ldr r0,=#0x2c5c85fe @ ln(2) Q30
|
|
|
muls r0,r2 @ accurate contribution of estimated exponent
|
|
|
subs r1,r0 @ residual to be exponentiated, guaranteed ≥0, < about 0.75 Q30
|
|
|
|
|
|
@ here
|
|
|
@ r1: mantissa to exponentiate, 0...~0.75 Q30
|
|
|
@ r2: first exponent estimate
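@ The loop below exponentiates the residual by factoring it into terms (1+2^-i),
@ using the table of log(1+2^-i) values at ftab_exp. An illustrative C sketch
@ (not the actual code, which also rounds the shifts):
@
@   uint32_t exp_residual_sketch(uint32_t r, const uint32_t *tab) { // r Q30, tab Q30
@     uint32_t x = 1u << 29;                  // x = 1.0 Q29
@     for (int i = 1; i <= 14; i++) {
@       if (r >= tab[i - 1]) {                // can a factor (1+2^-i) be taken out?
@         r -= tab[i - 1];                    // r -= log(1+2^-i)
@         x += x >> i;                        // x *= (1+2^-i)
@       }
@     }
@     return x;                               // remaining r < 2^-14 is applied linearly below
@   }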
|
|
|
|
|
|
movs r5,#1 @ shift
|
|
|
adr r3,ftab_exp @ could use alternate words from dtab_exp to save space if required
|
|
|
movs r0,#1
|
|
|
lsls r0,#29 @ x=1 Q29
|
|
|
|
|
|
3:
|
|
|
ldmia r3!,{r4}
|
|
|
subs r4,r1,r4
|
|
|
bmi 1f
|
|
|
movs r1,r4 @ keep result of subtraction
|
|
|
movs r4,r0
|
|
|
lsrs r4,r5
|
|
|
adcs r0,r4 @ x+=x>>i with rounding
|
|
|
|
|
|
1:
|
|
|
adds r5,#1
|
|
|
cmp r5,#15
|
|
|
bne 3b
|
|
|
|
|
|
@ here
|
|
|
@ r0: exp a Q29 1..2+
|
|
|
@ r1: ε (residual x where x=a+ε), < 2^-14 Q30
|
|
|
@ r2: first exponent estimate
|
|
|
@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
|
|
|
|
|
|
lsrs r3,r0,#15 @ exp a Q14
|
|
|
muls r3,r1 @ ε exp a Q44
|
|
|
lsrs r3,#15 @ ε exp a Q29
|
|
|
adcs r0,r3 @ (1+ε) exp a Q29 with rounding
|
|
|
|
|
|
b packretns @ pack result
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_fln:
|
|
|
push {r4,r5,r14}
|
|
|
asrs r1,r0,#23
|
|
|
bmi 3f @ -ve argument?
|
|
|
beq 3f @ 0 argument?
|
|
|
cmp r1,#0xff
|
|
|
beq 4f @ +Inf/NaN
|
|
|
bl unpackx
|
|
|
adds r2,#1
|
|
|
ldr r3,=#0x2c5c85fe @ ln(2) Q30
|
|
|
lsrs r1,r3,#14 @ ln(2) Q16
|
|
|
muls r1,r2 @ result estimate Q16
|
|
|
asrs r1,#16 @ integer contribution to result
|
|
|
muls r3,r2
|
|
|
lsls r4,r1,#30
|
|
|
subs r3,r4 @ fractional contribution to result Q30, signed
|
|
|
lsls r0,#8 @ Q31
|
|
|
|
|
|
@ here
|
|
|
@ r0: mantissa Q31
|
|
|
@ r1: integer contribution to result
|
|
|
@ r3: fractional contribution to result Q30, signed
|
|
|
|
|
|
movs r5,#1 @ shift
|
|
|
adr r4,ftab_exp @ could use alternate words from dtab_exp to save space if required
|
|
|
|
|
|
2:
|
|
|
movs r2,r0
|
|
|
lsrs r2,r5
|
|
|
adcs r2,r0 @ x+(x>>i) with rounding
|
|
|
bcs 1f @ >=2?
|
|
|
movs r0,r2 @ keep result
|
|
|
ldr r2,[r4]
|
|
|
subs r3,r2
|
|
|
1:
|
|
|
adds r4,#4
|
|
|
adds r5,#1
|
|
|
cmp r5,#15
|
|
|
bne 2b
|
|
|
|
|
|
@ here
|
|
|
@ r0: residual x, nearly 2 Q31
|
|
|
@ r1: integer contribution to result
|
|
|
@ r3: fractional part of result Q30
|
|
|
|
|
|
asrs r0,#2
|
|
|
adds r0,r3,r0
|
|
|
|
|
|
cmp r1,#0
|
|
|
bne 2f
|
|
|
|
|
|
asrs r0,#1
|
|
|
lsls r1,#29
|
|
|
adds r0,r1
|
|
|
movs r2,#0
|
|
|
b packretns
|
|
|
|
|
|
2:
|
|
|
lsls r1,#24
|
|
|
asrs r0,#6 @ Q24
|
|
|
adcs r0,r1 @ with rounding
|
|
|
movs r2,#5
|
|
|
b packretns
|
|
|
|
|
|
3:
|
|
|
ldr r0,=#0xff800000 @ -Inf
|
|
|
pop {r4,r5,r15}
|
|
|
4:
|
|
|
ldr r0,=#0x7f800000 @ +Inf
|
|
|
pop {r4,r5,r15}
|
|
|
|
|
|
.align 2
|
|
|
ftab_exp:
|
|
|
.word 0x19f323ed @ log 1+2^-1 Q30
|
|
|
.word 0x0e47fbe4 @ log 1+2^-2 Q30
|
|
|
.word 0x0789c1dc @ log 1+2^-3 Q30
|
|
|
.word 0x03e14618 @ log 1+2^-4 Q30
|
|
|
.word 0x01f829b1 @ log 1+2^-5 Q30
|
|
|
.word 0x00fe0546 @ log 1+2^-6 Q30
|
|
|
.word 0x007f80aa @ log 1+2^-7 Q30
|
|
|
.word 0x003fe015 @ log 1+2^-8 Q30
|
|
|
.word 0x001ff803 @ log 1+2^-9 Q30
|
|
|
.word 0x000ffe00 @ log 1+2^-10 Q30
|
|
|
.word 0x0007ff80 @ log 1+2^-11 Q30
|
|
|
.word 0x0003ffe0 @ log 1+2^-12 Q30
|
|
|
.word 0x0001fff8 @ log 1+2^-13 Q30
|
|
|
.word 0x0000fffe @ log 1+2^-14 Q30
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_fatan2:
|
|
|
push {r4,r5,r14}
|
|
|
|
|
|
@ unpack arguments and shift one down to have common exponent
|
|
|
bl unpackx
|
|
|
bl xchxy
|
|
|
bl unpackx
|
|
|
lsls r0,r0,#5 @ Q28
|
|
|
lsls r1,r1,#5 @ Q28
|
|
|
adds r4,r2,r3 @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
|
|
|
asrs r4,#9
|
|
|
adds r4,#1
|
|
|
bmi 2f @ force y to 0 proper, so result will be zero
|
|
|
subs r4,r2,r3 @ calculate shift
|
|
|
bge 1f @ ex>=ey?
|
|
|
rsbs r4,#0 @ make shift positive
|
|
|
asrs r0,r4
|
|
|
cmp r4,#28
|
|
|
blo 3f
|
|
|
asrs r0,#31
|
|
|
b 3f
|
|
|
1:
|
|
|
asrs r1,r4
|
|
|
cmp r4,#28
|
|
|
blo 3f
|
|
|
2:
|
|
|
@ here |x|>>|y| or both x and y are ±0
|
|
|
cmp r0,#0
|
|
|
bge 4f @ x positive, return signed 0
|
|
|
ldr r0,pi_q29 @ x negative, return +/- pi
|
|
|
asrs r1,#31
|
|
|
eors r0,r1
|
|
|
b 7f
|
|
|
4:
|
|
|
asrs r0,r1,#31
|
|
|
b 7f
|
|
|
3:
|
|
|
movs r2,#0 @ initial angle
|
|
|
cmp r0,#0 @ x negative
|
|
|
bge 5f
|
|
|
rsbs r0,#0 @ rotate to 1st/4th quadrants
|
|
|
rsbs r1,#0
|
|
|
ldr r2,pi_q29 @ pi Q29
|
|
|
5:
|
|
|
adr r3,tab_cc @ circular coefficients
|
|
|
movs r4,#1 @ m=1
|
|
|
bl cordic_vec @ also produces magnitude (with scaling factor 1.646760119), which is discarded
|
|
|
mov r0,r2 @ result here is -pi/2..3pi/2 Q29
|
|
|
@ asrs r2,#29
|
|
|
@ subs r0,r2
|
|
|
ldr r2,pi_q29 @ pi Q29
|
|
|
adds r4,r0,r2 @ attempt to fix -3pi/2..-pi case
|
|
|
bcs 6f @ -pi/2..0? leave result as is
|
|
|
subs r4,r0,r2 @ <pi? leave as is
|
|
|
bmi 6f
|
|
|
subs r0,r4,r2 @ >pi: take off 2pi
|
|
|
6:
|
|
|
subs r0,#1 @ fiddle factor so atan2(0,1)==0
|
|
|
7:
|
|
|
movs r2,#0 @ exponent for pack
|
|
|
b packretns
|
|
|
|
|
|
.align 2
|
|
|
.ltorg
|
|
|
|
|
|
@ first entry in following table is pi Q29
|
|
|
pi_q29:
|
|
|
@ circular CORDIC coefficients: atan(2^-i), b0=flag for preventing shift, b1=flag for end of table
|
|
|
tab_cc:
|
|
|
.word 0x1921fb54*4+1 @ no shift before first iteration
|
|
|
.word 0x0ed63383*4+0
|
|
|
.word 0x07d6dd7e*4+0
|
|
|
.word 0x03fab753*4+0
|
|
|
.word 0x01ff55bb*4+0
|
|
|
.word 0x00ffeaae*4+0
|
|
|
.word 0x007ffd55*4+0
|
|
|
.word 0x003fffab*4+0
|
|
|
.word 0x001ffff5*4+0
|
|
|
.word 0x000fffff*4+0
|
|
|
.word 0x0007ffff*4+0
|
|
|
.word 0x00040000*4+0
|
|
|
.word 0x00020000*4+0+2 @ +2 marks end
|
|
|
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_fsub:
|
|
|
ldr r2,=#0x80000000
|
|
|
eors r1,r2 @ flip sign on second argument
|
|
|
@ fall through into fadd, which follows on an .align 2 boundary
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_fadd:
|
|
|
push {r4,r5,r6,r14}
|
|
|
asrs r4,r0,#31
|
|
|
lsls r2,r0,#1
|
|
|
lsrs r2,#24 @ x exponent
|
|
|
beq fa_xe0
|
|
|
cmp r2,#255
|
|
|
beq fa_xe255
|
|
|
fa_xe:
|
|
|
asrs r5,r1,#31
|
|
|
lsls r3,r1,#1
|
|
|
lsrs r3,#24 @ y exponent
|
|
|
beq fa_ye0
|
|
|
cmp r3,#255
|
|
|
beq fa_ye255
|
|
|
fa_ye:
|
|
|
ldr r6,=#0x007fffff
|
|
|
ands r0,r0,r6 @ extract mantissa bits
|
|
|
ands r1,r1,r6
|
|
|
adds r6,#1 @ r6=0x00800000
|
|
|
orrs r0,r0,r6 @ set implied 1
|
|
|
orrs r1,r1,r6
|
|
|
eors r0,r0,r4 @ complement...
|
|
|
eors r1,r1,r5
|
|
|
subs r0,r0,r4 @ ... and add 1 if sign bit is set: 2's complement
|
|
|
subs r1,r1,r5
|
|
|
subs r5,r3,r2 @ ye-xe
|
|
|
subs r4,r2,r3 @ xe-ye
|
|
|
bmi fa_ygtx
|
|
|
@ here xe>=ye
|
|
|
cmp r4,#30
|
|
|
bge fa_xmgty @ xe much greater than ye?
|
|
|
adds r5,#32
|
|
|
movs r3,r2 @ save exponent
|
|
|
@ here y in r1 must be shifted down r4 places to align with x in r0
|
|
|
movs r2,r1
|
|
|
lsls r2,r2,r5 @ keep the bits we will shift off the bottom of r1
|
|
|
asrs r1,r1,r4
|
|
|
b fa_0
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
fa_ymgtx:
|
|
|
movs r2,#0 @ result is just y
|
|
|
movs r0,r1
|
|
|
b fa_1
|
|
|
fa_xmgty:
|
|
|
movs r3,r2 @ result is just x
|
|
|
movs r2,#0
|
|
|
b fa_1
|
|
|
|
|
|
fa_ygtx:
|
|
|
@ here ye>xe
|
|
|
cmp r5,#30
|
|
|
bge fa_ymgtx @ ye much greater than xe?
|
|
|
adds r4,#32
|
|
|
@ here x in r0 must be shifted down r5 places to align with y in r1
|
|
|
movs r2,r0
|
|
|
lsls r2,r2,r4 @ keep the bits we will shift off the bottom of r0
|
|
|
asrs r0,r0,r5
|
|
|
fa_0:
|
|
|
adds r0,r1 @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3
|
|
|
beq fa_9 @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0
|
|
|
fa_1:
|
|
|
lsrs r1,r0,#31 @ sign bit
|
|
|
beq fa_8
|
|
|
mvns r0,r0
|
|
|
rsbs r2,r2,#0
|
|
|
bne fa_8
|
|
|
adds r0,#1
|
|
|
fa_8:
|
|
|
adds r6,r6
|
|
|
@ r6=0x01000000
|
|
|
cmp r0,r6
|
|
|
bhs fa_2
|
|
|
fa_3:
|
|
|
adds r2,r2 @ normalisation loop
|
|
|
adcs r0,r0
|
|
|
subs r3,#1 @ adjust exponent
|
|
|
cmp r0,r6
|
|
|
blo fa_3
|
|
|
fa_2:
|
|
|
@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit
|
|
|
lsrs r0,#1
|
|
|
bcc fa_4
|
|
|
@ rounding bits here are 1:r2
|
|
|
adds r0,#1 @ round up
|
|
|
cmp r2,#0
|
|
|
beq fa_5 @ sticky bits all zero?
|
|
|
fa_4:
|
|
|
cmp r3,#254
|
|
|
bhs fa_6 @ exponent too large or negative?
|
|
|
lsls r1,#31 @ pack everything
|
|
|
add r0,r1
|
|
|
lsls r3,#23
|
|
|
add r0,r3
|
|
|
fa_end:
|
|
|
pop {r4,r5,r6,r15}
|
|
|
|
|
|
fa_9:
|
|
|
cmp r2,#0 @ result zero?
|
|
|
beq fa_end @ return +0
|
|
|
b fa_1
|
|
|
|
|
|
fa_5:
|
|
|
lsrs r0,#1
|
|
|
lsls r0,#1 @ round to even
|
|
|
b fa_4
|
|
|
|
|
|
fa_6:
|
|
|
bge fa_7
|
|
|
@ underflow
|
|
|
@ can handle denormals here
|
|
|
lsls r0,r1,#31 @ result is signed zero
|
|
|
pop {r4,r5,r6,r15}
|
|
|
fa_7:
|
|
|
@ overflow
|
|
|
lsls r0,r1,#8
|
|
|
adds r0,#255
|
|
|
lsls r0,#23 @ result is signed infinity
|
|
|
pop {r4,r5,r6,r15}
|
|
|
|
|
|
|
|
|
fa_xe0:
|
|
|
@ can handle denormals here
|
|
|
subs r2,#32
|
|
|
adds r2,r4 @ exponent -32 if +ve, -33 if -ve
|
|
|
b fa_xe
|
|
|
|
|
|
fa_xe255:
|
|
|
@ can handle NaNs here
|
|
|
lsls r2,#8
|
|
|
add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
|
|
|
b fa_xe
|
|
|
|
|
|
fa_ye0:
|
|
|
@ can handle denormals here
|
|
|
subs r3,#32
|
|
|
adds r3,r5 @ exponent -32 if +ve, -33 if -ve
|
|
|
b fa_ye
|
|
|
|
|
|
fa_ye255:
|
|
|
@ can handle NaNs here
|
|
|
lsls r3,#8
|
|
|
add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
|
|
|
b fa_ye
|
|
|
|
|
|
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_fmul:
|
|
|
push {r7,r14}
|
|
|
mov r2,r0
|
|
|
eors r2,r1 @ sign of result
|
|
|
lsrs r2,#31
|
|
|
lsls r2,#31
|
|
|
mov r14,r2
|
|
|
lsls r0,#1
|
|
|
lsls r1,#1
|
|
|
lsrs r2,r0,#24 @ xe
|
|
|
beq fm_xe0
|
|
|
cmp r2,#255
|
|
|
beq fm_xe255
|
|
|
fm_xe:
|
|
|
lsrs r3,r1,#24 @ ye
|
|
|
beq fm_ye0
|
|
|
cmp r3,#255
|
|
|
beq fm_ye255
|
|
|
fm_ye:
|
|
|
adds r7,r2,r3 @ exponent of result (will possibly be incremented)
|
|
|
subs r7,#128 @ adjust bias for packing
|
|
|
lsls r0,#8 @ x mantissa
|
|
|
lsls r1,#8 @ y mantissa
|
|
|
lsrs r0,#9
|
|
|
lsrs r1,#9
|
|
|
|
|
|
adds r2,r0,r1 @ for later
|
|
|
mov r12,r2
|
|
|
lsrs r2,r0,#7 @ x[22..7] Q16
|
|
|
lsrs r3,r1,#7 @ y[22..7] Q16
|
|
|
muls r2,r2,r3 @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
|
|
|
muls r0,r0,r1 @ result [31..0] Q46
|
|
|
lsrs r2,#18 @ result [45..32] Q14
|
|
|
bcc 1f
|
|
|
cmp r0,#0
|
|
|
bmi 1f
|
|
|
adds r2,#1 @ fix error in r2
|
|
|
1:
|
|
|
lsls r3,r0,#9 @ bits off bottom of result
|
|
|
lsrs r0,#23 @ Q23
|
|
|
lsls r2,#9
|
|
|
adds r0,r2 @ cut'n'shut
|
|
|
add r0,r12 @ implied 1*(x+y) to compensate for no insertion of implied 1s
|
|
|
@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
|
|
|
|
|
|
lsrs r1,r0,#23
|
|
|
bne fm_0 @ branch if we need to shift down one place
|
|
|
@ here 1<=result<2
|
|
|
cmp r7,#254
|
|
|
bhs fm_3a @ catches both underflow and overflow
|
|
|
lsls r3,#1 @ sticky bits at top of R3, rounding bit in carry
|
|
|
bcc fm_1 @ no rounding
|
|
|
beq fm_2 @ rounding tie?
|
|
|
adds r0,#1 @ round up
|
|
|
fm_1:
|
|
|
adds r7,#1 @ for implied 1
|
|
|
lsls r7,#23 @ pack result
|
|
|
add r0,r7
|
|
|
add r0,r14
|
|
|
pop {r7,r15}
|
|
|
fm_2: @ rounding tie
|
|
|
adds r0,#1
|
|
|
fm_3:
|
|
|
lsrs r0,#1
|
|
|
lsls r0,#1 @ clear bottom bit
|
|
|
b fm_1
|
|
|
|
|
|
@ here 1<=result-1<3
|
|
|
fm_0:
|
|
|
adds r7,#1 @ increment exponent
|
|
|
cmp r7,#254
|
|
|
bhs fm_3b @ catches both underflow and overflow
|
|
|
lsrs r0,#1 @ shift mantissa down
|
|
|
bcc fm_1a @ no rounding
|
|
|
adds r0,#1 @ assume we will round up
|
|
|
cmp r3,#0 @ sticky bits
|
|
|
beq fm_3c @ rounding tie?
|
|
|
fm_1a:
|
|
|
adds r7,r7
|
|
|
adds r7,#1 @ for implied 1
|
|
|
lsls r7,#22 @ pack result
|
|
|
add r0,r7
|
|
|
add r0,r14
|
|
|
pop {r7,r15}
|
|
|
|
|
|
fm_3c:
|
|
|
lsrs r0,#1
|
|
|
lsls r0,#1 @ clear bottom bit
|
|
|
b fm_1a
|
|
|
|
|
|
fm_xe0:
|
|
|
subs r2,#16
|
|
|
fm_xe255:
|
|
|
lsls r2,#8
|
|
|
b fm_xe
|
|
|
fm_ye0:
|
|
|
subs r3,#16
|
|
|
fm_ye255:
|
|
|
lsls r3,#8
|
|
|
b fm_ye
|
|
|
|
|
|
@ here the result is under- or overflowing
|
|
|
fm_3b:
|
|
|
bge fm_4 @ branch on overflow
|
|
|
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
|
|
|
adds r7,#1 @ exponent=-1?
|
|
|
bne fm_5
|
|
|
@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
|
|
|
@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
|
|
|
adds r0,#2
|
|
|
lsrs r0,#23
|
|
|
cmp r0,#3
|
|
|
bne fm_5
|
|
|
b fm_6
|
|
|
|
|
|
fm_3a:
|
|
|
bge fm_4 @ branch on overflow
|
|
|
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
|
|
|
adds r7,#1 @ exponent=-1?
|
|
|
bne fm_5
|
|
|
adds r0,#1 @ mantissa=0xffffff (i.e., r0=0x7fffff)?
|
|
|
lsrs r0,#23
|
|
|
beq fm_5
|
|
|
fm_6:
|
|
|
movs r0,#1 @ return smallest normal
|
|
|
lsls r0,#23
|
|
|
add r0,r14
|
|
|
pop {r7,r15}
|
|
|
|
|
|
fm_5:
|
|
|
mov r0,r14
|
|
|
pop {r7,r15}
|
|
|
fm_4:
|
|
|
movs r0,#0xff
|
|
|
lsls r0,#23
|
|
|
add r0,r14
|
|
|
pop {r7,r15}
|
|
|
|
|
|
@ This version of the division algorithm uses a small lookup table and a correction step to estimate the
|
|
|
@ reciprocal of the divisor to about 14 bits; then a multiplication step to get a first
|
|
|
@ quotient estimate; then the remainder based on this estimate is used to calculate a
|
|
|
@ correction to the quotient. The result is good to about 27 bits and so we only need
|
|
|
@ to calculate the exact remainder when close to a rounding boundary.
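@ As an illustrative fixed-point C sketch of the quotient construction (not the actual
@ code and not the divtest.c referred to below; u is the Q16 reciprocal estimate, x and
@ y are 9Q23 mantissas, and the wrap-around of 32-bit arithmetic mirrors the registers):
@
@   uint32_t fdiv_core_sketch(uint32_t y, uint32_t x, uint32_t u) {
@     uint32_t q0 = ((y >> 8) * u) >> 16;             // Q15 first quotient estimate
@     int32_t  r0 = (int32_t)((y << 15) - q0 * x);    // Q38 signed remainder
@     int32_t  d  = ((r0 >> 10) * (int32_t)u) >> 16;  // Q28 correction term
@     return (q0 << 13) + (uint32_t)d;                // Q28 quotient, ~27 good bits
@   }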
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_fdiv:
|
|
|
push {r4,r5,r6,r14}
|
|
|
fdiv_n:
|
|
|
|
|
|
movs r4,#1
|
|
|
lsls r4,#23 @ implied 1 position
|
|
|
lsls r2,r1,#9 @ clear out sign and exponent
|
|
|
lsrs r2,r2,#9
|
|
|
orrs r2,r2,r4 @ divisor mantissa Q23 with implied 1
|
|
|
|
|
|
@ here
|
|
|
@ r0=packed dividend
|
|
|
@ r1=packed divisor
|
|
|
@ r2=divisor mantissa Q23
|
|
|
@ r4=1<<23
|
|
|
|
|
|
// see divtest.c
|
|
|
lsrs r3,r2,#18 @ x2=x>>18; // Q5 32..63
|
|
|
adr r5,rcpapp-32
|
|
|
ldrb r3,[r5,r3] @ u=lut5[x2-32]; // Q8
|
|
|
lsls r5,r2,#5
|
|
|
muls r5,r5,r3
|
|
|
asrs r5,#14 @ e=(i32)(u*(x<<5))>>14; // Q22
|
|
|
asrs r6,r5,#11
|
|
|
muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
|
|
|
subs r5,r6
|
|
|
muls r5,r5,r3 @ c=(e-e2)*u; // Q30
|
|
|
lsls r6,r3,#8
|
|
|
asrs r5,#13
|
|
|
adds r5,#1
|
|
|
asrs r5,#1
|
|
|
subs r5,r6,r5 @ u0=(u<<8)-((c+0x2000)>>14); // Q16
|
|
|
|
|
|
@ here
|
|
|
@ r0=packed dividend
|
|
|
@ r1=packed divisor
|
|
|
@ r2=divisor mantissa Q23
|
|
|
@ r4=1<<23
|
|
|
@ r5=reciprocal estimate Q16
|
|
|
|
|
|
lsrs r6,r0,#23
|
|
|
uxtb r3,r6 @ dividend exponent
|
|
|
lsls r0,#9
|
|
|
lsrs r0,#9
|
|
|
orrs r0,r0,r4 @ dividend mantissa Q23
|
|
|
|
|
|
lsrs r1,#23
|
|
|
eors r6,r1 @ sign of result in bit 8
|
|
|
lsrs r6,#8
|
|
|
lsls r6,#31 @ sign of result in bit 31, other bits clear
|
|
|
|
|
|
@ here
|
|
|
@ r0=dividend mantissa Q23
|
|
|
@ r1=divisor sign+exponent
|
|
|
@ r2=divisor mantissa Q23
|
|
|
@ r3=dividend exponent
|
|
|
@ r5=reciprocal estimate Q16
|
|
|
@ r6b31=sign of result
|
|
|
|
|
|
uxtb r1,r1 @ divisor exponent
|
|
|
cmp r1,#0
|
|
|
beq retinf
|
|
|
cmp r1,#255
|
|
|
beq 20f @ divisor is infinite
|
|
|
cmp r3,#0
|
|
|
beq retzero
|
|
|
cmp r3,#255
|
|
|
beq retinf
|
|
|
subs r3,r1 @ initial result exponent (no bias)
|
|
|
adds r3,#125 @ add bias
|
|
|
|
|
|
lsrs r1,r0,#8 @ dividend mantissa Q15
|
|
|
|
|
|
@ here
|
|
|
@ r0=dividend mantissa Q23
|
|
|
@ r1=dividend mantissa Q15
|
|
|
@ r2=divisor mantissa Q23
|
|
|
@ r3=initial result exponent
|
|
|
@ r5=reciprocal estimate Q16
|
|
|
@ r6b31=sign of result
|
|
|
|
|
|
muls r1,r5
|
|
|
|
|
|
lsrs r1,#16 @ Q15 qu0=(q15)(u*y0);
|
|
|
lsls r0,r0,#15 @ dividend Q38
|
|
|
movs r4,r2
|
|
|
muls r4,r1 @ Q38 qu0*x
|
|
|
subs r4,r0,r4 @ Q38 re0=(y<<15)-qu0*x; note this remainder is signed
|
|
|
asrs r4,#10
|
|
|
muls r4,r5 @ Q44 qu1=(re0>>10)*u; this quotient correction is also signed
|
|
|
asrs r4,#16 @ Q28
|
|
|
lsls r1,#13
|
|
|
adds r1,r1,r4 @ Q28 qu=(qu0<<13)+(qu1>>16);
|
|
|
|
|
|
@ here
|
|
|
@ r0=dividend mantissa Q38
|
|
|
@ r1=quotient Q28
|
|
|
@ r2=divisor mantissa Q23
|
|
|
@ r3=initial result exponent
|
|
|
@ r6b31=sign of result
|
|
|
|
|
|
lsrs r4,r1,#28
|
|
|
bne 1f
|
|
|
@ here the quotient is less than 1<<28 (i.e., result mantissa <1.0)
|
|
|
|
|
|
adds r1,#5
|
|
|
lsrs r4,r1,#4 @ rounding + small reduction in systematic bias
|
|
|
bcc 2f @ skip if we are not near a rounding boundary
|
|
|
lsrs r1,#3 @ quotient Q25
|
|
|
lsls r0,#10 @ dividend mantissa Q48
|
|
|
muls r1,r1,r2 @ quotient*divisor Q48
|
|
|
subs r0,r0,r1 @ remainder Q48
|
|
|
bmi 2f
|
|
|
b 3f
|
|
|
|
|
|
1:
|
|
|
@ here the quotient is at least 1<<28 (i.e., result mantissa >=1.0)
|
|
|
|
|
|
adds r3,#1 @ bump exponent (and shift mantissa down one more place)
|
|
|
adds r1,#9
|
|
|
lsrs r4,r1,#5 @ rounding + small reduction in systematic bias
|
|
|
bcc 2f @ skip if we are not near a rounding boundary
|
|
|
|
|
|
lsrs r1,#4 @ quotient Q24
|
|
|
lsls r0,#9 @ dividend mantissa Q47
|
|
|
muls r1,r1,r2 @ quotient*divisor Q47
|
|
|
subs r0,r0,r1 @ remainder Q47
|
|
|
bmi 2f
|
|
|
3:
|
|
|
adds r4,#1 @ increment quotient as we are above the rounding boundary
|
|
|
|
|
|
@ here
|
|
|
@ r3=result exponent
|
|
|
@ r4=correctly rounded quotient Q23 in range [1,2] *note closed interval*
|
|
|
@ r6b31=sign of result
|
|
|
|
|
|
2:
|
|
|
cmp r3,#254
|
|
|
bhs 10f @ this catches both underflow and overflow
|
|
|
lsls r1,r3,#23
|
|
|
adds r0,r4,r1
|
|
|
adds r0,r6
|
|
|
pop {r4,r5,r6,r15}
|
|
|
|
|
|
@ here divisor is infinite; dividend exponent in r3
|
|
|
20:
|
|
|
cmp r3,#255
|
|
|
bne retzero
|
|
|
|
|
|
retinf:
|
|
|
movs r0,#255
|
|
|
21:
|
|
|
lsls r0,#23
|
|
|
orrs r0,r6
|
|
|
pop {r4,r5,r6,r15}
|
|
|
|
|
|
10:
|
|
|
bge retinf @ overflow?
|
|
|
adds r1,r3,#1
|
|
|
bne retzero @ exponent <-1? return 0
|
|
|
@ here exponent is exactly -1
|
|
|
lsrs r1,r4,#25
|
|
|
bcc retzero @ mantissa is not 01000000?
|
|
|
@ return minimum normal
|
|
|
movs r0,#1
|
|
|
lsls r0,#23
|
|
|
orrs r0,r6
|
|
|
pop {r4,r5,r6,r15}
|
|
|
|
|
|
retzero:
|
|
|
movs r0,r6
|
|
|
pop {r4,r5,r6,r15}
|
|
|
|
|
|
@ x2=[32:1:63]/32;
|
|
|
@ round(256 ./(x2+1/64))
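@ Illustrative C fragment (not part of the library) that regenerates the table below,
@ assuming <stdio.h> and <math.h>:
@
@   for (int i = 0; i < 32; i++) {        // x2 = (32+i)/32, i.e. [1,2) in steps of 1/32
@     double x2 = (32 + i) / 32.0;
@     printf("%d,", (int)floor(256.0 / (x2 + 1.0 / 64) + 0.5));
@   }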
|
|
|
.align 2
|
|
|
rcpapp:
|
|
|
.byte 252,245,237,231,224,218,213,207,202,197,193,188,184,180,176,172
|
|
|
.byte 169,165,162,159,156,153,150,148,145,142,140,138,135,133,131,129
|
|
|
|
|
|
@ The square root routine uses an initial approximation to the reciprocal of the square root of the argument based
|
|
|
@ on the top four bits of the mantissa (possibly shifted one place to make the exponent even). It then performs two
|
|
|
@ Newton-Raphson iterations, resulting in about 14 bits of accuracy. This reciprocal is then multiplied by
|
|
|
@ the original argument to produce an approximation to the result, again with about 14 bits of accuracy.
|
|
|
@ Then a remainder is calculated, and multiplied by the reciprocal estimate to generate a correction term
|
|
|
@ giving a final answer to about 28 bits of accuracy. A final remainder calculation rounds to the correct
|
|
|
@ result if necessary.
|
|
|
@ Again, the fixed-point calculation is carefully implemented to preserve accuracy, and similar comments to those
|
|
|
@ made above on the fast division routine apply.
|
|
|
@ The reciprocal square root calculation has been tested for all possible (possibly shifted) input mantissa values.
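@ As an illustrative C sketch of the scheme (not the actual fixed-point code): with y the
@ (possibly shifted) mantissa and a0 a small-table estimate of 1/sqrt(y),
@
@   double rsqrt_step_sketch(double a, double y) {   // Newton-Raphson step for 1/sqrt(y)
@     return a * (3.0 - y * a * a) * 0.5;
@   }
@   double fsqrt_sketch(double y, double a0) {
@     double a2 = rsqrt_step_sketch(rsqrt_step_sketch(a0, y), y); // ~14-bit 1/sqrt(y)
@     double s  = a2 * y;                            // ~14-bit sqrt(y)
@     double r  = y - s * s;                         // remainder
@     return s + 0.5 * r * a2;                       // ~28 bits; an exact remainder then
@   }                                                // settles any rounding decision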
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_fsqrt:
|
|
|
push {r4}
|
|
|
lsls r1,r0,#1
|
|
|
bcs sq_0 @ negative?
|
|
|
lsls r1,#8
|
|
|
lsrs r1,#9 @ mantissa
|
|
|
movs r2,#1
|
|
|
lsls r2,#23
|
|
|
adds r1,r2 @ insert implied 1
|
|
|
lsrs r2,r0,#23 @ extract exponent
|
|
|
beq sq_2 @ zero?
|
|
|
cmp r2,#255 @ infinite?
|
|
|
beq sq_1
|
|
|
adds r2,#125 @ correction for packing
|
|
|
asrs r2,#1 @ exponent/2, LSB into carry
|
|
|
bcc 1f
|
|
|
lsls r1,#1 @ was even: double mantissa; mantissa y now 1..4 Q23
|
|
|
1:
|
|
|
adr r4,rsqrtapp-4 @ first four table entries are never accessed because of the mantissa's leading 1
|
|
|
lsrs r3,r1,#21 @ y Q2
|
|
|
ldrb r4,[r4,r3] @ initial approximation to reciprocal square root a0 Q8
|
|
|
|
|
|
lsrs r0,r1,#7 @ y Q16: first Newton-Raphson iteration
|
|
|
muls r0,r4 @ a0*y Q24
|
|
|
muls r0,r4 @ r0=p0=a0*y*y Q32
|
|
|
asrs r0,#12 @ r0 Q20
|
|
|
muls r0,r4 @ dy0=a0*r0 Q28
|
|
|
asrs r0,#13 @ dy0 Q15
|
|
|
lsls r4,#8 @ a0 Q16
|
|
|
subs r4,r0 @ a1=a0-dy0/2 Q16-Q15/2 -> Q16
|
|
|
adds r4,#170 @ mostly remove systematic error in this approximation: gains approximately 1 bit
|
|
|
|
|
|
movs r0,r4 @ second Newton-Raphson iteration
|
|
|
muls r0,r0 @ a1*a1 Q32
|
|
|
lsrs r0,#15 @ a1*a1 Q17
|
|
|
lsrs r3,r1,#8 @ y Q15
|
|
|
muls r0,r3 @ r1=p1=a1*a1*y Q32
|
|
|
asrs r0,#12 @ r1 Q20
|
|
|
muls r0,r4 @ dy1=a1*r1 Q36
|
|
|
asrs r0,#21 @ dy1 Q15
|
|
|
subs r4,r0 @ a2=a1-dy1/2 Q16-Q15/2 -> Q16
|
|
|
|
|
|
muls r3,r4 @ a3=y*a2 Q31
|
|
|
lsrs r3,#15 @ a3 Q16
|
|
|
@ here a2 is an approximation to the reciprocal square root
|
|
|
@ and a3 is an approximation to the square root
|
|
|
movs r0,r3
|
|
|
muls r0,r0 @ a3*a3 Q32
|
|
|
lsls r1,#9 @ y Q32
|
|
|
subs r0,r1,r0 @ r2=y-a3*a3 Q32 remainder
|
|
|
asrs r0,#5 @ r2 Q27
|
|
|
muls r4,r0 @ r2*a2 Q43
|
|
|
lsls r3,#7 @ a3 Q23
|
|
|
asrs r0,r4,#15 @ r2*a2 Q28
|
|
|
adds r0,#16 @ rounding to Q24
|
|
|
asrs r0,r0,#6 @ r2*a2 Q22
|
|
|
add r3,r0 @ a4 Q23: candidate final result
|
|
|
bcc sq_3 @ near rounding boundary? skip if no rounding needed
|
|
|
mov r4,r3
|
|
|
adcs r4,r4 @ a4+0.5ulp Q24
|
|
|
muls r4,r4 @ Q48
|
|
|
lsls r1,#16 @ y Q48
|
|
|
subs r1,r4 @ remainder Q48
|
|
|
bmi sq_3
|
|
|
adds r3,#1 @ round up
|
|
|
sq_3:
|
|
|
lsls r2,#23 @ pack exponent
|
|
|
adds r0,r2,r3
|
|
|
sq_6:
|
|
|
pop {r4}
|
|
|
bx r14
|
|
|
|
|
|
sq_0:
|
|
|
lsrs r1,#24
|
|
|
beq sq_2 @ -0: return it
|
|
|
@ here negative and not -0: return -Inf
|
|
|
asrs r0,#31
|
|
|
sq_5:
|
|
|
lsls r0,#23
|
|
|
b sq_6
|
|
|
sq_1: @ +Inf
|
|
|
lsrs r0,#23
|
|
|
b sq_5
|
|
|
sq_2:
|
|
|
lsrs r0,#31
|
|
|
lsls r0,#31
|
|
|
b sq_6
|
|
|
|
|
|
@ round(sqrt(2^22./[72:16:248]))
|
|
|
rsqrtapp:
|
|
|
.byte 0xf1,0xda,0xc9,0xbb, 0xb0,0xa6,0x9e,0x97, 0x91,0x8b,0x86,0x82
|
|
|
|
|
|
|
|
|
|
|
|
@ Notation:
|
|
|
@ rx:ry means the concatenation of rx and ry with rx having the less significant bits
|
|
|
|
|
|
@ IEEE double in ra:rb ->
|
|
|
@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set
|
|
|
@ exponent in re
|
|
|
@ sign in rs
|
|
|
@ trashes rt
|
|
|
.macro mdunpack ra,rb,re,rs,rt
|
|
|
lsrs \re,\rb,#20 @ extract sign and exponent
|
|
|
subs \rs,\re,#1
|
|
|
lsls \rs,#20
|
|
|
subs \rb,\rs @ clear sign and exponent in mantissa; insert implied 1
|
|
|
lsrs \rs,\re,#11 @ sign
|
|
|
lsls \re,#21
|
|
|
lsrs \re,#21 @ exponent
|
|
|
beq l\@_1 @ zero exponent?
|
|
|
adds \rt,\re,#1
|
|
|
lsrs \rt,#11
|
|
|
beq l\@_2 @ exponent != 0x7ff? then done
|
|
|
l\@_1:
|
|
|
movs \ra,#0
|
|
|
movs \rb,#1
|
|
|
lsls \rb,#20
|
|
|
subs \re,#128
|
|
|
lsls \re,#12
|
|
|
l\@_2:
|
|
|
.endm
|
|
|
|
|
|
@ IEEE double in ra:rb ->
|
|
|
@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1
|
|
|
@ exponent in re
|
|
|
@ trashes rt0 and rt1
|
|
|
@ +zero, +denormal -> exponent=-0x80000
|
|
|
@ -zero, -denormal -> exponent=-0x80000
|
|
|
@ +Inf, +NaN -> exponent=+0x77f000
|
|
|
@ -Inf, -NaN -> exponent=+0x77e000
|
|
|
.macro mdunpacks ra,rb,re,rt0,rt1
|
|
|
lsrs \re,\rb,#20 @ extract sign and exponent
|
|
|
lsrs \rt1,\rb,#31 @ sign only
|
|
|
subs \rt0,\re,#1
|
|
|
lsls \rt0,#20
|
|
|
subs \rb,\rt0 @ clear sign and exponent in mantissa; insert implied 1
|
|
|
lsls \re,#21
|
|
|
bcc l\@_1 @ skip on positive
|
|
|
mvns \rb,\rb @ negate mantissa
|
|
|
rsbs \ra,#0
|
|
|
bcc l\@_1
|
|
|
adds \rb,#1
|
|
|
l\@_1:
|
|
|
lsrs \re,#21
|
|
|
beq l\@_2 @ zero exponent?
|
|
|
adds \rt0,\re,#1
|
|
|
lsrs \rt0,#11
|
|
|
beq l\@_3 @ exponent != 0x7ff? then done
|
|
|
subs \re,\rt1
|
|
|
l\@_2:
|
|
|
movs \ra,#0
|
|
|
lsls \rt1,#1 @ +ve: 0 -ve: 2
|
|
|
adds \rb,\rt1,#1 @ +ve: 1 -ve: 3
|
|
|
lsls \rb,#30 @ create +/-1 mantissa
|
|
|
asrs \rb,#10
|
|
|
subs \re,#128
|
|
|
lsls \re,#12
|
|
|
l\@_3:
|
|
|
.endm
|
|
|
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_dsub:
|
|
|
push {r4-r7,r14}
|
|
|
movs r4,#1
|
|
|
lsls r4,#31
|
|
|
eors r3,r4 @ flip sign on second argument
|
|
|
b da_entry @ continue in dadd
|
|
|
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_dadd:
|
|
|
push {r4-r7,r14}
|
|
|
da_entry:
|
|
|
mdunpacks r0,r1,r4,r6,r7
|
|
|
mdunpacks r2,r3,r5,r6,r7
|
|
|
subs r7,r5,r4 @ ye-xe
|
|
|
subs r6,r4,r5 @ xe-ye
|
|
|
bmi da_ygtx
|
|
|
@ here xe>=ye: need to shift y down r6 places
|
|
|
mov r12,r4 @ save exponent
|
|
|
cmp r6,#32
|
|
|
bge da_xrgty @ xe rather greater than ye?
|
|
|
adds r7,#32
|
|
|
movs r4,r2
|
|
|
lsls r4,r4,r7 @ rounding bit + sticky bits
|
|
|
da_xgty0:
|
|
|
movs r5,r3
|
|
|
lsls r5,r5,r7
|
|
|
lsrs r2,r6
|
|
|
asrs r3,r6
|
|
|
orrs r2,r5
|
|
|
da_add:
|
|
|
adds r0,r2
|
|
|
adcs r1,r3
|
|
|
da_pack:
|
|
|
@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4
|
|
|
@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we
|
|
|
@ cannot have gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4
|
|
|
@ to provide a correct continuation of the exact result.
|
|
|
@ now pack result back up
|
|
|
lsrs r3,r1,#31 @ get sign bit
|
|
|
beq 1f @ skip on positive
|
|
|
mvns r1,r1 @ negate mantissa
|
|
|
mvns r0,r0
|
|
|
movs r2,#0
|
|
|
rsbs r4,#0
|
|
|
adcs r0,r2
|
|
|
adcs r1,r2
|
|
|
1:
|
|
|
mov r2,r12 @ get exponent
|
|
|
lsrs r5,r1,#21
|
|
|
bne da_0 @ shift down required?
|
|
|
lsrs r5,r1,#20
|
|
|
bne da_1 @ normalised?
|
|
|
cmp r0,#0
|
|
|
beq da_5 @ could mantissa be zero?
|
|
|
da_2:
|
|
|
adds r4,r4
|
|
|
adcs r0,r0
|
|
|
adcs r1,r1
|
|
|
subs r2,#1 @ adjust exponent
|
|
|
lsrs r5,r1,#20
|
|
|
beq da_2
|
|
|
da_1:
|
|
|
lsls r4,#1 @ check rounding bit
|
|
|
bcc da_3
|
|
|
da_4:
|
|
|
adds r0,#1 @ round up
|
|
|
bcc 2f
|
|
|
adds r1,#1
|
|
|
2:
|
|
|
cmp r4,#0 @ sticky bits zero?
|
|
|
bne da_3
|
|
|
lsrs r0,#1 @ round to even
|
|
|
lsls r0,#1
|
|
|
da_3:
|
|
|
subs r2,#1
|
|
|
bmi da_6
|
|
|
adds r4,r2,#2 @ check if exponent is overflowing
|
|
|
lsrs r4,#11
|
|
|
bne da_7
|
|
|
lsls r2,#20 @ pack exponent and sign
|
|
|
add r1,r2
|
|
|
lsls r3,#31
|
|
|
add r1,r3
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
da_7:
|
|
|
@ here exponent overflow: return signed infinity
|
|
|
lsls r1,r3,#31
|
|
|
ldr r3,=#0x7ff00000
|
|
|
orrs r1,r3
|
|
|
b 1f
|
|
|
da_6:
|
|
|
@ here exponent underflow: return signed zero
|
|
|
lsls r1,r3,#31
|
|
|
1:
|
|
|
movs r0,#0
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
da_5:
|
|
|
@ here mantissa could be zero
|
|
|
cmp r1,#0
|
|
|
bne da_2
|
|
|
cmp r4,#0
|
|
|
bne da_2
|
|
|
@ inputs must have been of identical magnitude and opposite sign, so return +0
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
da_0:
|
|
|
@ here a shift down by one place is required for normalisation
|
|
|
adds r2,#1 @ adjust exponent
|
|
|
lsls r6,r0,#31 @ save rounding bit
|
|
|
lsrs r0,#1
|
|
|
lsls r5,r1,#31
|
|
|
orrs r0,r5
|
|
|
lsrs r1,#1
|
|
|
cmp r6,#0
|
|
|
beq da_3
|
|
|
b da_4
|
|
|
|
|
|
da_xrgty: @ xe>ye and shift>=32 places
|
|
|
cmp r6,#60
|
|
|
bge da_xmgty @ xe much greater than ye?
|
|
|
subs r6,#32
|
|
|
adds r7,#64
|
|
|
|
|
|
movs r4,r2
|
|
|
lsls r4,r4,r7 @ these would be shifted off the bottom of the sticky bits
|
|
|
beq 1f
|
|
|
movs r4,#1
|
|
|
1:
|
|
|
lsrs r2,r2,r6
|
|
|
orrs r4,r2
|
|
|
movs r2,r3
|
|
|
lsls r3,r3,r7
|
|
|
orrs r4,r3
|
|
|
asrs r3,r2,#31 @ propagate sign bit
|
|
|
b da_xgty0
|
|
|
|
|
|
da_ygtx:
|
|
|
@ here ye>xe: need to shift x down r7 places
|
|
|
mov r12,r5 @ save exponent
|
|
|
cmp r7,#32
|
|
|
bge da_yrgtx @ ye rather greater than xe?
|
|
|
adds r6,#32
|
|
|
movs r4,r0
|
|
|
lsls r4,r4,r6 @ rounding bit + sticky bits
|
|
|
da_ygtx0:
|
|
|
movs r5,r1
|
|
|
lsls r5,r5,r6
|
|
|
lsrs r0,r7
|
|
|
asrs r1,r7
|
|
|
orrs r0,r5
|
|
|
b da_add
|
|
|
|
|
|
da_yrgtx:
|
|
|
cmp r7,#60
|
|
|
bge da_ymgtx @ ye much greater than xe?
|
|
|
subs r7,#32
|
|
|
adds r6,#64
|
|
|
|
|
|
movs r4,r0
|
|
|
lsls r4,r4,r6 @ these would be shifted off the bottom of the sticky bits
|
|
|
beq 1f
|
|
|
movs r4,#1
|
|
|
1:
|
|
|
lsrs r0,r0,r7
|
|
|
orrs r4,r0
|
|
|
movs r0,r1
|
|
|
lsls r1,r1,r6
|
|
|
orrs r4,r1
|
|
|
asrs r1,r0,#31 @ propagate sign bit
|
|
|
b da_ygtx0
|
|
|
|
|
|
da_ymgtx: @ result is just y
|
|
|
movs r0,r2
|
|
|
movs r1,r3
|
|
|
da_xmgty: @ result is just x
|
|
|
movs r4,#0 @ clear sticky bits
|
|
|
b da_pack
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
@ equivalent of UMULL
|
|
|
@ needs five temporary registers
|
|
|
@ can have rt3==rx, in which case rx trashed
|
|
|
@ can have rt4==ry, in which case ry trashed
|
|
|
@ can have rzl==rx
|
|
|
@ can have rzh==ry
|
|
|
@ can have rzl,rzh==rt3,rt4
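@ Illustrative C equivalent (not the actual code): a 32x32->64-bit unsigned multiply
@ assembled from four 16x16-bit products, since the M0+ MULS only returns 32 bits:
@
@   uint64_t mul32_32_64_sketch(uint32_t x, uint32_t y) {
@     uint32_t xl = x & 0xffff, xh = x >> 16;
@     uint32_t yl = y & 0xffff, yh = y >> 16;
@     uint32_t m0 = xh * yl, m1 = xl * yh;
@     uint32_t mid = m0 + m1;                                         // may wrap
@     uint64_t hi  = (uint64_t)(xh * yh) + (mid < m0 ? 0x10000u : 0); // carry from mid
@     return ((hi << 32) | (xl * yl)) + ((uint64_t)mid << 16);
@   }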
|
|
|
.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
|
|
|
@ t0 t1 t2 t3 t4
|
|
|
@ (x) (y)
|
|
|
uxth \rt0,\rx @ xl
|
|
|
uxth \rt1,\ry @ yl
|
|
|
muls \rt0,\rt1 @ xlyl=L
|
|
|
lsrs \rt2,\rx,#16 @ xh
|
|
|
muls \rt1,\rt2 @ xhyl=M0
|
|
|
lsrs \rt4,\ry,#16 @ yh
|
|
|
muls \rt2,\rt4 @ xhyh=H
|
|
|
uxth \rt3,\rx @ xl
|
|
|
muls \rt3,\rt4 @ xlyh=M1
|
|
|
adds \rt1,\rt3 @ M0+M1=M
|
|
|
bcc l\@_1 @ addition of the two cross terms can overflow, so add carry into H
|
|
|
movs \rt3,#1 @ 1
|
|
|
lsls \rt3,#16 @ 0x10000
|
|
|
adds \rt2,\rt3 @ H'
|
|
|
l\@_1:
|
|
|
@ t0 t1 t2 t3 t4
|
|
|
@ (zl) (zh)
|
|
|
lsls \rzl,\rt1,#16 @ ML
|
|
|
lsrs \rzh,\rt1,#16 @ MH
|
|
|
adds \rzl,\rt0 @ ZL
|
|
|
adcs \rzh,\rt2 @ ZH
|
|
|
.endm
|
|
|
|
|
|
@ SUMULL: x signed, y unsigned
|
|
|
@ in table below ¯ means signed variable
|
|
|
@ needs five temporary registers
|
|
|
@ can have rt3==rx, in which case rx trashed
|
|
|
@ can have rt4==ry, in which case ry trashed
|
|
|
@ can have rzl==rx
|
|
|
@ can have rzh==ry
|
|
|
@ can have rzl,rzh==rt3,rt4
|
|
|
.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
|
|
|
@ t0 t1 t2 t3 t4
|
|
|
@ ¯(x) (y)
|
|
|
uxth \rt0,\rx @ xl
|
|
|
uxth \rt1,\ry @ yl
|
|
|
muls \rt0,\rt1 @ xlyl=L
|
|
|
asrs \rt2,\rx,#16 @ ¯xh
|
|
|
muls \rt1,\rt2 @ ¯xhyl=M0
|
|
|
lsrs \rt4,\ry,#16 @ yh
|
|
|
muls \rt2,\rt4 @ ¯xhyh=H
|
|
|
uxth \rt3,\rx @ xl
|
|
|
muls \rt3,\rt4 @ xlyh=M1
|
|
|
asrs \rt4,\rt1,#31 @ M0sx (M1 sign extension is zero)
|
|
|
adds \rt1,\rt3 @ M0+M1=M
|
|
|
movs \rt3,#0 @ 0
|
|
|
adcs \rt4,\rt3 @ ¯Msx
|
|
|
lsls \rt4,#16 @ ¯Msx<<16
|
|
|
adds \rt2,\rt4 @ H'
|
|
|
|
|
|
@ t0 t1 t2 t3 t4
|
|
|
@ (zl) (zh)
|
|
|
lsls \rzl,\rt1,#16 @ M~
|
|
|
lsrs \rzh,\rt1,#16 @ M~
|
|
|
adds \rzl,\rt0 @ ZL
|
|
|
adcs \rzh,\rt2 @ ¯ZH
|
|
|
.endm
|
|
|
|
|
|
@ SSMULL: x signed, y signed
|
|
|
@ in table below ¯ means signed variable
|
|
|
@ needs five temporary registers
|
|
|
@ can have rt3==rx, in which case rx trashed
|
|
|
@ can have rt4==ry, in which case ry trashed
|
|
|
@ can have rzl==rx
|
|
|
@ can have rzh==ry
|
|
|
@ can have rzl,rzh==rt3,rt4
|
|
|
.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
|
|
|
@ t0 t1 t2 t3 t4
|
|
|
@ ¯(x) (y)
|
|
|
uxth \rt0,\rx @ xl
|
|
|
uxth \rt1,\ry @ yl
|
|
|
muls \rt0,\rt1 @ xlyl=L
|
|
|
asrs \rt2,\rx,#16 @ ¯xh
|
|
|
muls \rt1,\rt2 @ ¯xhyl=M0
|
|
|
asrs \rt4,\ry,#16 @ ¯yh
|
|
|
muls \rt2,\rt4 @ ¯xhyh=H
|
|
|
uxth \rt3,\rx @ xl
|
|
|
muls \rt3,\rt4 @ ¯xlyh=M1
|
|
|
adds \rt1,\rt3 @ ¯M0+M1=M
|
|
|
asrs \rt3,\rt1,#31 @ Msx
|
|
|
bvc l\@_1 @
|
|
|
mvns \rt3,\rt3 @ ¯Msx flip sign extension bits if overflow
|
|
|
l\@_1:
|
|
|
lsls \rt3,#16 @ ¯Msx<<16
|
|
|
adds \rt2,\rt3 @ H'
|
|
|
|
|
|
@ t0 t1 t2 t3 t4
|
|
|
@ (zl) (zh)
|
|
|
lsls \rzl,\rt1,#16 @ M~
|
|
|
lsrs \rzh,\rt1,#16 @ M~
|
|
|
adds \rzl,\rt0 @ ZL
|
|
|
adcs \rzh,\rt2 @ ¯ZH
|
|
|
.endm
|
|
|
|
|
|
@ can have rt2==rx, in which case rx trashed
|
|
|
@ can have rzl==rx
|
|
|
@ can have rzh==rt1
|
|
|
.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2
|
|
|
@ t0 t1 t2 zl zh
|
|
|
uxth \rt0,\rx @ xl
|
|
|
muls \rt0,\rt0 @ xlxl=L
|
|
|
uxth \rt1,\rx @ xl
|
|
|
lsrs \rt2,\rx,#16 @ xh
|
|
|
muls \rt1,\rt2 @ xlxh=M
|
|
|
muls \rt2,\rt2 @ xhxh=H
|
|
|
lsls \rzl,\rt1,#17 @ ML
|
|
|
lsrs \rzh,\rt1,#15 @ MH
|
|
|
adds \rzl,\rt0 @ ZL
|
|
|
adcs \rzh,\rt2 @ ZH
|
|
|
.endm
|
|
|
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_dmul:
|
|
|
push {r4-r7,r14}
|
|
|
mdunpack r0,r1,r4,r6,r5
|
|
|
mov r12,r4
|
|
|
mdunpack r2,r3,r4,r7,r5
|
|
|
eors r7,r6 @ sign of result
|
|
|
add r4,r12 @ exponent of result
|
|
|
push {r0-r2,r4,r7}
|
|
|
|
|
|
@ accumulate full product in r12:r5:r6:r7
|
|
|
mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5 @ XL*YL
|
|
|
mov r12,r0 @ save LL bits
|
|
|
|
|
|
mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7 @ XH*YH
|
|
|
|
|
|
pop {r0} @ XL
|
|
|
mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3 @ XL*YH
|
|
|
adds r5,r0
|
|
|
adcs r6,r3
|
|
|
movs r0,#0
|
|
|
adcs r7,r0
|
|
|
|
|
|
pop {r1,r2} @ XH,YL
|
|
|
mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2 @ XH*YL
|
|
|
adds r5,r1
|
|
|
adcs r6,r2
|
|
|
movs r0,#0
|
|
|
adcs r7,r0
|
|
|
|
|
|
@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12
|
|
|
pop {r3,r4} @ exponent in r3, sign in r4
|
|
|
lsls r1,r7,#11
|
|
|
lsrs r2,r6,#21
|
|
|
orrs r1,r2
|
|
|
lsls r0,r6,#11
|
|
|
lsrs r2,r5,#21
|
|
|
orrs r0,r2
|
|
|
lsls r5,#11 @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12
|
|
|
lsrs r2,r1,#20
|
|
|
bne 1f @ skip if in range [2..4)
|
|
|
adds r5,r5 @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32)
|
|
|
adcs r0,r0
|
|
|
adcs r1,r1
|
|
|
subs r3,#1 @ correct exponent
|
|
|
1:
|
|
|
ldr r6,=#0x3ff
|
|
|
subs r3,r6 @ correct for exponent bias
|
|
|
lsls r6,#1 @ 0x7fe
|
|
|
cmp r3,r6
|
|
|
bhs dm_0 @ exponent over- or underflow
|
|
|
lsls r5,#1 @ rounding bit to carry
|
|
|
bcc 1f @ result is correctly rounded
|
|
|
adds r0,#1
|
|
|
movs r6,#0
|
|
|
adcs r1,r6 @ round up
|
|
|
mov r6,r12 @ remaining sticky bits
|
|
|
orrs r5,r6
|
|
|
bne 1f @ some sticky bits set?
|
|
|
lsrs r0,#1
|
|
|
lsls r0,#1 @ round to even
|
|
|
1:
|
|
|
lsls r3,#20
|
|
|
adds r1,r3
|
|
|
dm_2:
|
|
|
lsls r4,#31
|
|
|
add r1,r4
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
@ here for exponent over- or underflow
|
|
|
dm_0:
|
|
|
bge dm_1 @ overflow?
|
|
|
adds r3,#1 @ would-be zero exponent?
|
|
|
bne 1f
|
|
|
adds r0,#1
|
|
|
bne 1f @ all-ones mantissa?
|
|
|
adds r1,#1
|
|
|
lsrs r7,r1,#21
|
|
|
beq 1f
|
|
|
lsrs r1,#1
|
|
|
b dm_2
|
|
|
1:
|
|
|
lsls r1,r4,#31
|
|
|
movs r0,#0
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
@ here for exponent overflow
|
|
|
dm_1:
|
|
|
adds r6,#1 @ 0x7ff
|
|
|
lsls r1,r6,#20
|
|
|
movs r0,#0
|
|
|
b dm_2
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
@ Approach to division y/x is as follows.
|
|
|
@
|
|
|
@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top
|
|
|
@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so).
|
|
|
@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction
|
|
|
@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact
|
|
|
@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to
|
|
|
@ round up or down.
|
|
|
@
|
|
|
@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively
|
|
|
@ that | u1*x-1 | < 10*2^-32.
|
|
|
@
|
|
|
@ More precisely:
|
|
|
@
|
|
|
@ x0=(q16)x;
|
|
|
@ x1=(q30)x;
|
|
|
@ y0=(q31)y;
|
|
|
@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation
|
|
|
@ v=(q30)(u0*x1-1);
|
|
|
@ u1=(q30)u0-(q30~)(u0*v);
|
|
|
@
|
|
|
@ a0=(q30)(u1*y0);
|
|
|
@ r0=(q82)y-a0*x;
|
|
|
@ r0x=(q57)r0;
|
|
|
@ d0=r0x*u1;
|
|
|
@ a1=d0+a0;
|
|
|
@
|
|
|
@ Error analysis
|
|
|
@
|
|
|
@ Use Greek letters to represent the errors introduced by rounding and truncation.
|
|
|
@
|
|
|
@ r₀ = y - a₀x
|
|
|
@ = y - [ u₁ ( y - α ) - β ] x where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30
|
|
|
@ = y ( 1 - u₁x ) + ( u₁α + β ) x
|
|
|
@
|
|
|
@ Hence
|
|
|
@
|
|
|
@ | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30
|
|
|
@ = 26*2^-32
|
|
|
@
|
|
|
@ r₁ = y - a₁x
|
|
|
@ = y - a₀x - d₀x
|
|
|
@ = r₀ - d₀x
|
|
|
@ = r₀ - u₁ ( r₀ - γ ) x where 0 ≤ γ < 2^-57
|
|
|
@ = r₀ ( 1 - u₁x ) + u₁γx
|
|
|
@
|
|
|
@ Hence
|
|
|
@
|
|
|
@ | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57
|
|
|
@ = (260+128)*2^-64
|
|
|
@ < 2^-55
|
|
|
@
|
|
|
@ Empirically it seems to be nearly twice as good as this.
|
|
|
@
|
|
|
@ To determine correctly whether the exact remainder calculation can be skipped we need a result
|
|
|
@ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation
|
|
|
@ and so 1ulp is 2^-53 and so the calculation above suffices.
|
|
|
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_ddiv:
|
|
|
push {r4-r7,r14}
|
|
|
ddiv0: @ entry point from dtan
|
|
|
mdunpack r2,r3,r4,r7,r6 @ unpack divisor
|
|
|
|
|
|
@ unpack dividend by hand to save on register use
|
|
|
lsrs r6,r1,#31
|
|
|
adds r6,r7
|
|
|
mov r12,r6 @ result sign in r12b0; r12b1 trashed
|
|
|
lsls r1,#1
|
|
|
lsrs r7,r1,#21 @ exponent
|
|
|
beq 1f @ zero exponent?
|
|
|
adds r6,r7,#1
|
|
|
lsrs r6,#11
|
|
|
beq 2f @ exponent != 0x7ff? then done
|
|
|
1:
|
|
|
movs r0,#0
|
|
|
movs r1,#0
|
|
|
subs r7,#64 @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct
|
|
|
lsls r7,#12
|
|
|
2:
|
|
|
subs r6,r7,r4
|
|
|
lsls r6,#2
|
|
|
add r12,r12,r6 @ (signed) exponent in r12[31..8]
|
|
|
subs r7,#1 @ implied 1
|
|
|
lsls r7,#21
|
|
|
subs r1,r7
|
|
|
lsrs r1,#1
|
|
|
|
|
|
// see dreciptest-boxc.c
|
|
|
lsrs r4,r3,#15 @ x2=x>>15; // Q5 32..63
|
|
|
ldr r5,=#(rcpapp-32)
|
|
|
ldrb r4,[r5,r4] @ u=lut5[x2-32]; // Q8
|
|
|
lsls r5,r3,#8
|
|
|
muls r5,r5,r4
|
|
|
asrs r5,#14 @ e=(i32)(u*(x<<8))>>14; // Q22
|
|
|
asrs r6,r5,#11
|
|
|
muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
|
|
|
subs r5,r6
|
|
|
muls r5,r5,r4 @ c=(e-e2)*u; // Q30
|
|
|
lsls r6,r4,#7
|
|
|
asrs r5,#14
|
|
|
adds r5,#1
|
|
|
asrs r5,#1
|
|
|
subs r6,r5 @ u0=(u<<7)-((c+0x4000)>>15); // Q15
|
|
|
|
|
|
@ here
|
|
|
@ r0:r1 y mantissa
|
|
|
@ r2:r3 x mantissa
|
|
|
@ r6 u0, first approximation to 1/x Q15
|
|
|
@ r12: result sign, exponent
|
|
|
|
|
|
lsls r4,r3,#10
|
|
|
lsrs r5,r2,#22
|
|
|
orrs r5,r4 @ x1=(q30)x
|
|
|
muls r5,r6 @ u0*x1 Q45
|
|
|
asrs r5,#15 @ v=u0*x1-1 Q30
|
|
|
muls r5,r6 @ u0*v Q45
|
|
|
asrs r5,#14
|
|
|
adds r5,#1
|
|
|
asrs r5,#1 @ round u0*v to Q30
|
|
|
lsls r6,#15
|
|
|
subs r6,r5 @ u1 Q30
|
|
|
|
|
|
@ here
|
|
|
@ r0:r1 y mantissa
|
|
|
@ r2:r3 x mantissa
|
|
|
@ r6 u1, second approximation to 1/x Q30
|
|
|
@ r12: result sign, exponent
|
|
|
|
|
|
push {r2,r3}
|
|
|
lsls r4,r1,#11
|
|
|
lsrs r5,r0,#21
|
|
|
orrs r4,r5 @ y0=(q31)y
|
|
|
mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5 @ y0*u1 Q61
|
|
|
adds r4,r4
|
|
|
adcs r5,r5 @ a0=(q30)(y0*u1)
|
|
|
|
|
|
@ here
|
|
|
@ r0:r1 y mantissa
|
|
|
@ r5 a0, first approximation to y/x Q30
|
|
|
@ r6 u1, second approximation to 1/x Q30
|
|
|
@ r12 result sign, exponent
|
|
|
|
|
|
ldr r2,[r13,#0] @ xL
|
|
|
mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3 @ xL*a0
|
|
|
ldr r4,[r13,#4] @ xH
|
|
|
muls r4,r5 @ xH*a0
|
|
|
adds r3,r4 @ r2:r3 now x*a0 Q82
|
|
|
lsrs r2,#25
|
|
|
lsls r1,r3,#7
|
|
|
orrs r2,r1 @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89
|
|
|
lsls r4,r0,#5 @ y Q57
|
|
|
subs r0,r4,r2 @ r0x=y-x*a0 Q57 (signed)
|
|
|
|
|
|
@ here
|
|
|
@ r0 r0x Q57
|
|
|
@ r5 a0, first approximation to y/x Q30
|
|
|
@ r4 yL Q57
|
|
|
@ r6 u1 Q30
|
|
|
@ r12 result sign, exponent
|
|
|
|
|
|
muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6 @ r7:r6 r0x*u1 Q87
|
|
|
asrs r3,r6,#25
|
|
|
adds r5,r3
|
|
|
lsls r3,r6,#7 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point)
|
|
|
@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7
|
|
|
@ but these bits are thrown away in the rounding and conversion to Q52 below
|
|
|
|
|
|
@ here
|
|
|
@ r3:r5 a1 Q62 candidate quotient [0.5,2) or so
|
|
|
@ r4 yL Q57
|
|
|
@ r12 result sign, exponent
|
|
|
|
|
|
movs r6,#0
|
|
|
adds r3,#128 @ for initial rounding to Q53
|
|
|
adcs r5,r5,r6
|
|
|
lsrs r1,r5,#30
|
|
|
bne dd_0
|
|
|
@ here candidate quotient a1 is in range [0.5,1)
|
|
|
@ so 30 significant bits in r5
|
|
|
|
|
|
lsls r4,#1 @ y now Q58
|
|
|
lsrs r1,r5,#9 @ to Q52
|
|
|
lsls r0,r5,#23
|
|
|
lsrs r3,#9 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
|
|
|
orrs r0,r3
|
|
|
bcs dd_1
|
|
|
b dd_2
|
|
|
dd_0:
|
|
|
@ here candidate quotient a1 is in range [1,2)
|
|
|
@ so 31 significant bits in r5
|
|
|
|
|
|
movs r2,#4
|
|
|
add r12,r12,r2 @ fix exponent; r3:r5 now effectively Q61
|
|
|
adds r3,#128 @ complete rounding to Q53
|
|
|
adcs r5,r5,r6
|
|
|
lsrs r1,r5,#10
|
|
|
lsls r0,r5,#22
|
|
|
lsrs r3,#10 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
|
|
|
orrs r0,r3
|
|
|
bcc dd_2
|
|
|
dd_1:
|
|
|
|
|
|
@ here
|
|
|
@ r0:r1 rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest
|
|
|
@ r4 yL Q58 or Q57
|
|
|
@ r12 result sign, exponent
|
|
|
@ carry set
|
|
|
|
|
|
adcs r0,r0,r0
|
|
|
adcs r1,r1,r1 @ z Q53 with 1 in LSB
|
|
|
lsls r4,#16 @ Q105-32=Q73
|
|
|
ldr r2,[r13,#0] @ xL Q52
|
|
|
ldr r3,[r13,#4] @ xH Q20
|
|
|
|
|
|
movs r5,r1 @ zH Q21
|
|
|
muls r5,r2 @ zH*xL Q73
|
|
|
subs r4,r5
|
|
|
muls r3,r0 @ zL*xH Q73
|
|
|
subs r4,r3
|
|
|
mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3 @ xL*zL
|
|
|
rsbs r2,#0 @ borrow from low half?
|
|
|
sbcs r4,r3 @ y-xz Q73 (remainder bits 52..73)
|
|
|
|
|
|
cmp r4,#0
|
|
|
|
|
|
bmi 1f
|
|
|
movs r2,#0 @ round up
|
|
|
adds r0,#1
|
|
|
adcs r1,r2
|
|
|
1:
|
|
|
lsrs r0,#1 @ shift back down to Q52
|
|
|
lsls r2,r1,#31
|
|
|
orrs r0,r2
|
|
|
lsrs r1,#1
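
/*
What the code above does, in miniature: the candidate quotient is formed
with its bottom bit forced to 1, which makes it exactly the halfway value
between the two nearest representable results, and the sign of the exact
remainder y-x*z then picks the correctly rounded neighbour.  A toy C
version of the same idea (small operands, illustrative names):

#include <stdint.h>

// y < 2^62, x > 0, quotient assumed to fit in 31 bits
static uint32_t div_round_nearest(uint64_t y, uint32_t x)
{
    uint32_t q = (uint32_t)((2*y)/x) | 1;        // halfway value 2*floor(y/x)+1
    int64_t  r = (int64_t)(2*y-(uint64_t)q*x);   // exact remainder against the halfway value
    uint32_t z = q>>1;                           // floor(y/x)
    if (r>=0) z++;                               // fraction >= 1/2: round up
    return z;
}
*/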
|
|
|
dd_2:
|
|
|
add r13,#8
|
|
|
mov r2,r12
|
|
|
lsls r7,r2,#31 @ result sign
|
|
|
asrs r2,#2 @ result exponent
|
|
|
ldr r3,=#0x3fd
|
|
|
adds r2,r3
|
|
|
ldr r3,=#0x7fe
|
|
|
cmp r2,r3
|
|
|
bhs dd_3 @ over- or underflow?
|
|
|
lsls r2,#20
|
|
|
adds r1,r2 @ pack exponent
|
|
|
dd_5:
|
|
|
adds r1,r7 @ pack sign
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
dd_3:
|
|
|
movs r0,#0
|
|
|
cmp r2,#0
|
|
|
bgt dd_4 @ overflow?
|
|
|
movs r1,r7
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
dd_4:
|
|
|
adds r3,#1 @ 0x7ff
|
|
|
lsls r1,r3,#20
|
|
|
b dd_5
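
/*
The packing above folds the mantissa's implicit leading 1 into the exponent
field by addition, which is why the limits checked are 0 and 0x7fe rather
than 1 and 0x7ff.  With the fields kept explicit the same clamping looks
like this (a sketch; qfplib does not produce denormals or NaNs here):

#include <stdint.h>

static uint64_t pack_double(uint32_t sign, int32_t biased_exp, uint64_t frac52)
{
    uint64_t s = (uint64_t)(sign&1)<<63;
    if (biased_exp<=0)     return s;                        // underflow: signed zero
    if (biased_exp>=0x7ff) return s|(0x7ffull<<52);         // overflow: signed infinity
    return s|((uint64_t)biased_exp<<52)|(frac52&((1ull<<52)-1));
}
*/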
|
|
|
|
|
|
/*
|
|
|
Approach to square root x=sqrt(y) is as follows.
|
|
|
|
|
|
First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y
|
|
|
to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because
|
|
|
d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is
|
|
|
a better approximation to sqrt(y). If this is near a rounding boundary we
|
|
|
compute an exact remainder y-a5*a5 to decide whether to round up or down.
|
|
|
|
|
|
The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively
|
|
|
that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32.
|
|
|
|
|
|
More precisely, with "y" representing y truncated to 30 binary places:
|
|
|
|
|
|
u=(q3)y; // 24-entry table
|
|
|
a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u); // first approximation from table
|
|
|
p0=(q16)(a0*a0) * (q16)y;
|
|
|
r0=(q20)(p0-1);
|
|
|
dy0=(q15)(r0*a0); // Newton-Raphson correction term
|
|
|
a1=(q16)a0-dy0/2; // good to ~9 bits
|
|
|
|
|
|
p1=(q19)(a1*a1)*(q19)y;
|
|
|
r1=(q23)(p1-1);
|
|
|
dy1=(q15~)(r1*a1); // second Newton-Raphson correction
|
|
|
a2x=(q16)a1-dy1/2; // good to ~16 bits
|
|
|
a2=a2x-a2x/1t16; // prevent overflow of a2*a2 in 32 bits
|
|
|
|
|
|
p2=(a2*a2)*(q30)y; // Q62
|
|
|
r2=(q36)(p2-1+1t-31);
|
|
|
dy2=(q30)(r2*a2); // Q52->Q30
|
|
|
a3=(q31)a2-dy2/2; // good to about 30 bits
|
|
|
a4=(q30)(a3*(q30)y+1t-31); // good to about 28 bits
|
|
|
|
|
|
Error analysis
|
|
|
|
|
|
r₄ = y - a₄²
|
|
|
d₄ = 1/2 a₃r₄
|
|
|
a₅ = a₄ + d₄
|
|
|
r₅ = y - a₅²
|
|
|
= y - ( a₄ + d₄ )²
|
|
|
= y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄²
|
|
|
= r₄ - a₃a₄r₄ - 1/4 a₃²r₄²
|
|
|
|
|
|
| r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄²
|
|
|
|
|
|
a₅ = √y √( 1 - r₅/y )
|
|
|
= √y ( 1 - 1/2 r₅/y + ... )
|
|
|
|
|
|
So to first order (second order being very tiny)
|
|
|
|
|
|
√y - a₅ = 1/2 r₅/y
|
|
|
|
|
|
and
|
|
|
|
|
|
| √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y )
|
|
|
|
|
|
From dsqrttest.c (conservatively):
|
|
|
|
|
|
< 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 )
|
|
|
= 1/2 ( 200 + 200 ) * 2^-64
|
|
|
< 2^-56
|
|
|
|
|
|
Empirically we see about 1ulp worst-case error including rounding at Q57.
|
|
|
|
|
|
To determine correctly whether the exact remainder calculation can be skipped we need a result
|
|
|
accurate to < 0.25ulp at Q52, or 2^-54.
|
|
|
*/
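
/*
A toy C model of the correction step described above, at Q30 instead of the
Q57/Q62 used by the assembly (a sketch; names and scalings are illustrative
only, and arithmetic right shifts of negative values are assumed, as on ARM):

#include <stdint.h>

// y : argument, Q30, 1 <= y < 4
// a3: ~1/sqrt(y), Q31, good to ~30 bits
// a4: ~sqrt(y),   Q30, good to ~28 bits
// returns a5 = a4 + (y-a4*a4)*a3/2 ~ sqrt(y), Q30
static uint32_t sqrt_correct(uint32_t y, uint32_t a3, uint32_t a4)
{
    int64_t r4 = ((int64_t)y<<30)-(int64_t)a4*a4;   // remainder y-a4^2, Q60
    int64_t d4 = ((r4>>4)*(int64_t)a3)>>58;         // r4*a3/2, Q30
    return (uint32_t)((int64_t)a4+d4);
}
*/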
|
|
|
|
|
|
dq_2:
|
|
|
bge dq_3 @ +Inf?
|
|
|
movs r1,#0
|
|
|
b dq_4
|
|
|
|
|
|
dq_0:
|
|
|
lsrs r1,#31
|
|
|
lsls r1,#31 @ preserve sign bit
|
|
|
lsrs r2,#21 @ extract exponent
|
|
|
beq dq_4 @ -0? return it
|
|
|
asrs r1,#11 @ make -Inf
|
|
|
b dq_4
|
|
|
|
|
|
dq_3:
|
|
|
ldr r1,=#0x7ff
|
|
|
lsls r1,#20 @ return +Inf
|
|
|
dq_4:
|
|
|
movs r0,#0
|
|
|
dq_1:
|
|
|
bx r14
|
|
|
|
|
|
.align 2
|
|
|
.thumb_func
|
|
|
qfp_dsqrt:
|
|
|
lsls r2,r1,#1
|
|
|
bcs dq_0 @ negative?
|
|
|
lsrs r2,#21 @ extract exponent
|
|
|
subs r2,#1
|
|
|
ldr r3,=#0x7fe
|
|
|
cmp r2,r3
|
|
|
bhs dq_2 @ catches 0 and +Inf
|
|
|
push {r4-r7,r14}
|
|
|
lsls r4,r2,#20
|
|
|
subs r1,r4 @ insert implied 1
|
|
|
lsrs r2,#1
|
|
|
bcc 1f @ even exponent? skip
|
|
|
adds r0,r0,r0 @ odd exponent: shift up mantissa
|
|
|
adcs r1,r1,r1
|
|
|
1:
|
|
|
lsrs r3,#2
|
|
|
adds r2,r3
|
|
|
lsls r2,#20
|
|
|
mov r12,r2 @ save result exponent
|
|
|
|
|
|
@ here
|
|
|
@ r0:r1 y mantissa Q52 [1,4)
|
|
|
@ r12 result exponent
|
|
|
|
|
|
adr r4,drsqrtapp-8 @ first eight table entries are never accessed because of the mantissa's leading 1
|
|
|
lsrs r2,r1,#17 @ y Q3
|
|
|
ldrb r2,[r4,r2] @ initial approximation to reciprocal square root a0 Q8
|
|
|
lsrs r3,r1,#4 @ first Newton-Raphson iteration
|
|
|
muls r3,r2
|
|
|
muls r3,r2 @ i32 p0=a0*a0*(y>>14); // Q32
|
|
|
asrs r3,r3,#12 @ i32 r0=p0>>12; // Q20
|
|
|
muls r3,r2
|
|
|
asrs r3,#13 @ i32 dy0=(r0*a0)>>13; // Q15
|
|
|
lsls r2,#8
|
|
|
subs r2,r3 @ i32 a1=(a0<<8)-dy0; // Q16
|
|
|
|
|
|
movs r3,r2
|
|
|
muls r3,r3
|
|
|
lsrs r3,#13
|
|
|
lsrs r4,r1,#1
|
|
|
muls r3,r4 @ i32 p1=((a1*a1)>>11)*(y>>11); // Q19*Q19=Q38
|
|
|
asrs r3,#15 @ i32 r1=p1>>15; // Q23
|
|
|
muls r3,r2
|
|
|
asrs r3,#23
|
|
|
adds r3,#1
|
|
|
asrs r3,#1 @ i32 dy1=(r1*a1+(1<<23))>>24; // Q23*Q16=Q39; Q15
|
|
|
subs r2,r3 @ i32 a2=a1-dy1; // Q16
|
|
|
lsrs r3,r2,#16
|
|
|
subs r2,r3 @ if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2
|
|
|
|
|
|
@ here
|
|
|
@ r0:r1 y mantissa
|
|
|
@ r2 a2 ~ 1/sqrt(y) Q16
|
|
|
@ r12 result exponent
|
|
|
|
|
|
movs r3,r2
|
|
|
muls r3,r3
|
|
|
lsls r1,#10
|
|
|
lsrs r4,r0,#22
|
|
|
orrs r1,r4 @ y Q30
|
|
|
mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3 @ i64 p2=(ui64)(a2*a2)*(ui64)y; // Q62 r4:r3
|
|
|
lsls r5,r3,#6
|
|
|
lsrs r4,#26
|
|
|
orrs r4,r5
|
|
|
adds r4,#0x20 @ i32 r2=(p2>>26)+0x20; // Q36 r4
|
|
|
uxth r5,r4
|
|
|
muls r5,r2
|
|
|
asrs r4,#16
|
|
|
muls r4,r2
|
|
|
lsrs r5,#16
|
|
|
adds r4,r5
|
|
|
asrs r4,#6 @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30
|
|
|
lsls r2,#15
|
|
|
subs r2,r4
|
|
|
|
|
|
@ here
|
|
|
@ r0 y low bits
|
|
|
@ r1 y Q30
|
|
|
@ r2 a3 ~ 1/sqrt(y) Q31
|
|
|
@ r12 result exponent
|
|
|
|
|
|
mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4
|
|
|
adds r3,r3,r3
|
|
|
adcs r4,r4,r4
|
|
|
adds r3,r3,r3
|
|
|
movs r3,#0
|
|
|
adcs r3,r4 @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30
|
|
|
|
|
|
@ here
|
|
|
@ r0 y low bits
|
|
|
@ r1 y Q30
|
|
|
@ r2 a3 Q31 ~ 1/sqrt(y)
|
|
|
@ r3 a4 Q30 ~ sqrt(y)
|
|
|
@ r12 result exponent
|
|
|
|
|
|
square32_64 r3, r4,r5, r6,r5,r7
|
|
|
lsls r6,r0,#8
|
|
|
lsrs r7,r1,#2
|
|
|
subs r6,r4
|
|
|
sbcs r7,r5 @ r4=(q60)y-a4*a4
|
|
|
|
|
|
@ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60
|
|
|
|
|
|
lsls r5,r7,#29
|
|
|
lsrs r6,#3
|
|
|
adcs r6,r5 @ r4 Q57 with rounding
|
|
|
muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2 @ d4=a3*r4/2 Q89
|
|
|
@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30
|
|
|
|
|
|
adds r2,#8
|
|
|
asrs r2,#5 @ d4 Q52, rounded to Q53 with spare bit in carry
|
|
|
|
|
|
@ here
|
|
|
@ r0 y low bits
|
|
|
@ r1 y Q30
|
|
|
@ r2 d4 Q52, rounded to Q53
|
|
|
@ C flag contains d4_b53
|
|
|
@ r3 a4 Q30
|
|
|
|
|
|
bcs dq_5
|
|
|
|
|
|
lsrs r5,r3,#10 @ a4 Q52
|
|
|
lsls r4,r3,#22
|
|
|
|
|
|
asrs r1,r2,#31
|
|
|
adds r0,r2,r4
|
|
|
adcs r1,r5 @ a4+d4
|
|
|
|
|
|
add r1,r12 @ pack exponent
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
|
|
|
@ round(sqrt(2^22./[68:8:252]))
|
|
|
drsqrtapp:
|
|
|
.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8
|
|
|
.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95
|
|
|
.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81
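
/*
The 24 bytes of drsqrtapp are round(2^8/sqrt(y)) evaluated at the midpoint
of each Q3 bucket of the mantissa y in [1,4), which is exactly the
round(sqrt(2^22./[68:8:252])) expression in the comment.  They can be
regenerated with, for example (illustrative C, not part of the library):

#include <math.h>
#include <stdio.h>

int main(void)
{
    for (int x=68; x<=252; x+=8)                             // x = 8*yQ3+4, yQ3 = 8..31
        printf("0x%02x\n", (int)lround(sqrt(4194304.0/x)));  // round(sqrt(2^22/x))
    return 0;
}
*/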
|
|
|
|
|
|
dq_5:
|
|
|
@ here we are near a rounding boundary, C is set
|
|
|
adcs r2,r2,r2 @ d4 Q53+1ulp
|
|
|
lsrs r5,r3,#9
|
|
|
lsls r4,r3,#23 @ r4:r5 a4 Q53
|
|
|
asrs r1,r2,#31
|
|
|
adds r4,r2,r4
|
|
|
adcs r5,r1 @ r4:r5 a5=a4+d4 Q53+1ulp
|
|
|
movs r3,r5
|
|
|
muls r3,r4
|
|
|
square32_64 r4,r1,r2,r6,r2,r7
|
|
|
adds r2,r3
|
|
|
adds r2,r3 @ r1:r2 a5^2 Q106
|
|
|
lsls r0,#22 @ y Q84
|
|
|
|
|
|
rsbs r1,#0
|
|
|
sbcs r0,r2 @ remainder y-a5^2
|
|
|
bmi 1f @ y<a5^2: no need to increment a5
|
|
|
movs r3,#0
|
|
|
adds r4,#1
|
|
|
adcs r5,r3 @ bump a5 if over rounding boundary
|
|
|
1:
|
|
|
lsrs r0,r4,#1
|
|
|
lsrs r1,r5,#1
|
|
|
lsls r5,#31
|
|
|
orrs r0,r5
|
|
|
add r1,r12
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
@ compare r0:r1 against r2:r3, returning -1/0/1 for <, =, >
|
|
|
@ also set flags accordingly
|
|
|
.thumb_func
|
|
|
qfp_dcmp:
|
|
|
push {r4,r6,r7,r14}
|
|
|
ldr r7,=#0x7ff @ flush NaNs and denormals
|
|
|
lsls r4,r1,#1
|
|
|
lsrs r4,#21
|
|
|
beq 1f
|
|
|
cmp r4,r7
|
|
|
bne 2f
|
|
|
1:
|
|
|
movs r0,#0
|
|
|
lsrs r1,#20
|
|
|
lsls r1,#20
|
|
|
2:
|
|
|
lsls r4,r3,#1
|
|
|
lsrs r4,#21
|
|
|
beq 1f
|
|
|
cmp r4,r7
|
|
|
bne 2f
|
|
|
1:
|
|
|
movs r2,#0
|
|
|
lsrs r3,#20
|
|
|
lsls r3,#20
|
|
|
2:
|
|
|
dcmp_fast_entry:
|
|
|
movs r6,#1
|
|
|
eors r3,r1
|
|
|
bmi 4f @ opposite signs? then can proceed on basis of sign of x
|
|
|
eors r3,r1 @ restore r3
|
|
|
bpl 1f
|
|
|
rsbs r6,#0 @ negative? flip comparison
|
|
|
1:
|
|
|
cmp r1,r3
|
|
|
bne 1f
|
|
|
cmp r0,r2
|
|
|
bhi 2f
|
|
|
blo 3f
|
|
|
5:
|
|
|
movs r6,#0 @ equal? result is 0
|
|
|
1:
|
|
|
bgt 2f
|
|
|
3:
|
|
|
rsbs r6,#0
|
|
|
2:
|
|
|
subs r0,r6,#0 @ copy and set flags
|
|
|
pop {r4,r6,r7,r15}
|
|
|
4:
|
|
|
orrs r3,r1 @ make -0==+0
|
|
|
adds r3,r3
|
|
|
orrs r3,r0
|
|
|
orrs r3,r2
|
|
|
beq 5b
|
|
|
cmp r1,#0
|
|
|
bge 2b
|
|
|
b 3b
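
/*
The comparison strategy above, restated in C (a sketch, not the exact code
path): NaNs and denormals are first flushed by clearing the mantissa, after
which doubles compare as sign-magnitude 64-bit integers, with +0 and -0
treated as equal.

#include <stdint.h>

static int dcmp_sketch(uint64_t a, uint64_t b)          // raw IEEE-754 bit patterns
{
    uint32_t ea = (uint32_t)(a>>52)&0x7ff, eb = (uint32_t)(b>>52)&0x7ff;
    if (ea==0||ea==0x7ff) a &= 0xfff0000000000000ull;   // flush denormal/NaN mantissa
    if (eb==0||eb==0x7ff) b &= 0xfff0000000000000ull;
    if (((a|b)<<1)==0) return 0;                        // -0 == +0
    int sa = (int)(a>>63), sb = (int)(b>>63);
    if (sa!=sb) return sa ? -1 : 1;                     // opposite signs: sign decides
    if (a==b)   return 0;
    return ((a>b)^sa) ? 1 : -1;                         // same sign: compare magnitudes
}
*/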
|
|
|
|
|
|
|
|
|
@ "scientific" functions start here
|
|
|
|
|
|
.thumb_func
|
|
|
push_r8_r11:
|
|
|
mov r4,r8
|
|
|
mov r5,r9
|
|
|
mov r6,r10
|
|
|
mov r7,r11
|
|
|
push {r4-r7}
|
|
|
bx r14
|
|
|
|
|
|
.thumb_func
|
|
|
pop_r8_r11:
|
|
|
pop {r4-r7}
|
|
|
mov r8,r4
|
|
|
mov r9,r5
|
|
|
mov r10,r6
|
|
|
mov r11,r7
|
|
|
bx r14
|
|
|
|
|
|
@ double-length CORDIC rotation step
|
|
|
|
|
|
@ r0:r1 ω
|
|
|
@ r6 32-i (complementary shift)
|
|
|
@ r7 i (shift)
|
|
|
@ r8:r9 x
|
|
|
@ r10:r11 y
|
|
|
@ r12 coefficient pointer
|
|
|
|
|
|
@ an option in rotation mode would be to compute the sequence of σ values
|
|
|
@ in one pass, rotate the initial vector by the residual ω and then run a
|
|
|
@ second pass to compute the final x and y. This would relieve pressure
|
|
|
@ on registers and hence possibly be faster. The same trick does not work
|
|
|
@ in vectoring mode (but perhaps one could work to single precision in
|
|
|
@ a first pass and then double precision in a second pass?).
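
/*
One CORDIC step, restated in C on plain 64-bit variables (the assembly
below keeps omega, x and y in register pairs and carries a rounding bit in
the flags; names here are illustrative):

#include <stdint.h>

typedef struct { int64_t w, x, y; } cordic_t;   // omega, x, y

// i : step index (shift amount); dw : atan(2^-i) from dtab_cc, same scale as w
// rotation mode decides on the sign of omega; vectoring mode tests y instead
static void cordic_rot_step(cordic_t *s, int i, int64_t dw)
{
    int64_t x = s->x, y = s->y;
    if (s->w<0) {                   // omega += dw, x += y>>i, y -= x>>i
        s->w += dw; s->x = x+(y>>i); s->y = y-(x>>i);
    } else {                        // omega -= dw, x -= y>>i, y += x>>i
        s->w -= dw; s->x = x-(y>>i); s->y = y+(x>>i);
    }
}
*/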
|
|
|
|
|
|
.thumb_func
|
|
|
dcordic_vec_step:
|
|
|
mov r2,r12
|
|
|
ldmia r2!,{r3,r4}
|
|
|
mov r12,r2
|
|
|
mov r2,r11
|
|
|
cmp r2,#0
|
|
|
blt 1f
|
|
|
b 2f
|
|
|
|
|
|
.thumb_func
|
|
|
dcordic_rot_step:
|
|
|
mov r2,r12
|
|
|
ldmia r2!,{r3,r4}
|
|
|
mov r12,r2
|
|
|
cmp r1,#0
|
|
|
bge 1f
|
|
|
2:
|
|
|
@ ω<0 / y>=0
|
|
|
@ ω+=dω
|
|
|
@ x+=y>>i, y-=x>>i
|
|
|
adds r0,r3
|
|
|
adcs r1,r4
|
|
|
|
|
|
mov r3,r11
|
|
|
asrs r3,r7
|
|
|
mov r4,r11
|
|
|
lsls r4,r6
|
|
|
mov r2,r10
|
|
|
lsrs r2,r7
|
|
|
orrs r2,r4 @ r2:r3 y>>i, rounding in carry
|
|
|
mov r4,r8
|
|
|
mov r5,r9 @ r4:r5 x
|
|
|
adcs r2,r4
|
|
|
adcs r3,r5 @ r2:r3 x+(y>>i)
|
|
|
mov r8,r2
|
|
|
mov r9,r3
|
|
|
|
|
|
mov r3,r5
|
|
|
lsls r3,r6
|
|
|
asrs r5,r7
|
|
|
lsrs r4,r7
|
|
|
orrs r4,r3 @ r4:r5 x>>i, rounding in carry
|
|
|
mov r2,r10
|
|
|
mov r3,r11
|
|
|
sbcs r2,r4
|
|
|
sbcs r3,r5 @ r2:r3 y-(x>>i)
|
|
|
mov r10,r2
|
|
|
mov r11,r3
|
|
|
bx r14
|
|
|
|
|
|
|
|
|
@ ω>0 / y<0
|
|
|
@ ω-=dω
|
|
|
@ x-=y>>i, y+=x>>i
|
|
|
1:
|
|
|
subs r0,r3
|
|
|
sbcs r1,r4
|
|
|
|
|
|
mov r3,r9
|
|
|
asrs r3,r7
|
|
|
mov r4,r9
|
|
|
lsls r4,r6
|
|
|
mov r2,r8
|
|
|
lsrs r2,r7
|
|
|
orrs r2,r4 @ r2:r3 x>>i, rounding in carry
|
|
|
mov r4,r10
|
|
|
mov r5,r11 @ r4:r5 y
|
|
|
adcs r2,r4
|
|
|
adcs r3,r5 @ r2:r3 y+(x>>i)
|
|
|
mov r10,r2
|
|
|
mov r11,r3
|
|
|
|
|
|
mov r3,r5
|
|
|
lsls r3,r6
|
|
|
asrs r5,r7
|
|
|
lsrs r4,r7
|
|
|
orrs r4,r3 @ r4:r5 y>>i, rounding in carry
|
|
|
mov r2,r8
|
|
|
mov r3,r9
|
|
|
sbcs r2,r4
|
|
|
sbcs r3,r5 @ r2:r3 x-(y>>i)
|
|
|
mov r8,r2
|
|
|
mov r9,r3
|
|
|
bx r14
|
|
|
|
|
|
ret_dzero:
|
|
|
movs r0,#0
|
|
|
movs r1,#0
|
|
|
bx r14
|
|
|
|
|
|
@ convert double to signed int, rounding towards 0, clamping
|
|
|
.thumb_func
|
|
|
qfp_double2int_z:
|
|
|
cmp r1,#0
|
|
|
bge qfp_double2int @ +ve or zero? then use rounding towards -Inf
|
|
|
push {r14}
|
|
|
lsls r1,#1 @ -ve: clear sign bit
|
|
|
lsrs r1,#1
|
|
|
bl qfp_double2uint @ convert to unsigned, rounding towards -Inf
|
|
|
movs r1,#1
|
|
|
lsls r1,#31 @ r1=0x80000000
|
|
|
cmp r0,r1
|
|
|
bhi 1f
|
|
|
rsbs r0,#0
|
|
|
pop {r15}
|
|
|
1:
|
|
|
mov r0,r1
|
|
|
pop {r15}
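
/*
C-level restatement of qfp_double2int_z (a sketch; prototypes assumed to
match the library's C header): a negative argument is converted as |x| with
the unsigned conversion, which rounds towards -Inf, then negated and
clamped at INT32_MIN.

#include <stdint.h>

extern int32_t  qfp_double2int(double x);    // rounds towards -Inf
extern uint32_t qfp_double2uint(double x);   // rounds towards -Inf

static int32_t double2int_z_model(double x)
{
    union { double d; uint64_t u; } v = { x };
    if (!(v.u>>63)) return qfp_double2int(x);          // >= +0: floor == truncate
    v.u &= ~(1ull<<63);                                // |x|
    uint32_t m = qfp_double2uint(v.d);                 // floor(|x|)
    return m>=0x80000000u ? INT32_MIN : -(int32_t)m;   // clamp at INT32_MIN
}
*/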
|
|
|
|
|
|
@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf
|
|
|
@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_double2int:
|
|
|
movs r2,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_double2fix:
|
|
|
push {r14}
|
|
|
adds r2,#32
|
|
|
bl qfp_double2fix64
|
|
|
movs r0,r1
|
|
|
pop {r15}
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_double2uint:
|
|
|
movs r2,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_double2ufix:
|
|
|
push {r14}
|
|
|
adds r2,#32
|
|
|
bl qfp_double2ufix64
|
|
|
movs r0,r1
|
|
|
pop {r15}
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_float2int64_z:
|
|
|
cmp r0,#0
|
|
|
bge qfp_float2int64 @ +ve or zero? then use rounding towards -Inf
|
|
|
push {r14}
|
|
|
lsls r0,#1 @ -ve: clear sign bit
|
|
|
lsrs r0,#1
|
|
|
bl qfp_float2uint64 @ convert to unsigned, rounding towards -Inf
|
|
|
movs r2,#1
|
|
|
lsls r2,#31 @ r2=0x80000000
|
|
|
cmp r1,r2
|
|
|
bhs 1f
|
|
|
mvns r1,r1
|
|
|
rsbs r0,#0
|
|
|
bcc 2f
|
|
|
adds r1,#1
|
|
|
2:
|
|
|
pop {r15}
|
|
|
1:
|
|
|
movs r0,#0
|
|
|
mov r1,r2
|
|
|
pop {r15}
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_float2int64:
|
|
|
movs r1,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_float2fix64:
|
|
|
push {r14}
|
|
|
bl f2fix
|
|
|
b d2f64_a
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_float2uint64:
|
|
|
movs r1,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_float2ufix64:
|
|
|
asrs r3,r0,#23 @ negative? return 0
|
|
|
bmi ret_dzero
|
|
|
@ and fall through
|
|
|
|
|
|
@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
|
|
|
@ result clamped so that r3 can only be 0 or -1
|
|
|
@ trashes r12
|
|
|
.thumb_func
|
|
|
f2fix:
|
|
|
push {r4,r14}
|
|
|
mov r12,r1
|
|
|
asrs r3,r0,#31
|
|
|
lsls r0,#1
|
|
|
lsrs r2,r0,#24
|
|
|
beq 1f @ zero?
|
|
|
cmp r2,#0xff @ Inf?
|
|
|
beq 2f
|
|
|
subs r1,r2,#1
|
|
|
subs r2,#0x7f @ remove exponent bias
|
|
|
lsls r1,#24
|
|
|
subs r0,r1 @ insert implied 1
|
|
|
eors r0,r3
|
|
|
subs r0,r3 @ top two's complement
|
|
|
asrs r1,r0,#4 @ convert to double format
|
|
|
lsls r0,#28
|
|
|
b d2fix_a
|
|
|
1:
|
|
|
movs r0,#0
|
|
|
movs r1,r0
|
|
|
movs r3,r0
|
|
|
pop {r4,r15}
|
|
|
2:
|
|
|
mvns r0,r3 @ return max/min value
|
|
|
mvns r1,r3
|
|
|
pop {r4,r15}
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_double2int64_z:
|
|
|
cmp r1,#0
|
|
|
bge qfp_double2int64 @ +ve or zero? then use rounding towards -Inf
|
|
|
push {r14}
|
|
|
lsls r1,#1 @ -ve: clear sign bit
|
|
|
lsrs r1,#1
|
|
|
bl qfp_double2uint64 @ convert to unsigned, rounding towards -Inf
|
|
|
cmp r1,#0
|
|
|
blt 1f
|
|
|
mvns r1,r1
|
|
|
rsbs r0,#0
|
|
|
bcc 2f
|
|
|
adds r1,#1
|
|
|
2:
|
|
|
pop {r15}
|
|
|
1:
|
|
|
movs r0,#0
|
|
|
movs r1,#1
|
|
|
lsls r1,#31 @ 0x80000000
|
|
|
pop {r15}
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_double2int64:
|
|
|
movs r2,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_double2fix64:
|
|
|
push {r14}
|
|
|
bl d2fix
|
|
|
d2f64_a:
|
|
|
asrs r2,r1,#31
|
|
|
cmp r2,r3
|
|
|
bne 1f @ sign extension bits fail to match sign of result?
|
|
|
pop {r15}
|
|
|
1:
|
|
|
mvns r0,r3
|
|
|
movs r1,#1
|
|
|
lsls r1,#31
|
|
|
eors r1,r1,r0 @ generate extreme fixed-point values
|
|
|
pop {r15}
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_double2uint64:
|
|
|
movs r2,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_double2ufix64:
|
|
|
asrs r3,r1,#20 @ negative? return 0
|
|
|
bmi ret_dzero
|
|
|
@ and fall through
|
|
|
|
|
|
@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf
|
|
|
@ result clamped so that r3 can only be 0 or -1
|
|
|
@ trashes r12
|
|
|
.thumb_func
|
|
|
d2fix:
|
|
|
push {r4,r14}
|
|
|
mov r12,r2
|
|
|
bl dunpacks
|
|
|
asrs r4,r2,#16
|
|
|
adds r4,#1
|
|
|
bge 1f
|
|
|
movs r1,#0 @ -0 -> +0
|
|
|
1:
|
|
|
asrs r3,r1,#31
|
|
|
d2fix_a:
|
|
|
@ here
|
|
|
@ r0:r1 two's complement mantissa
|
|
|
@ r2 unbiased exponent
|
|
|
@ r3 mantissa sign extension bits
|
|
|
add r2,r12 @ exponent plus offset for required binary point position
|
|
|
subs r2,#52 @ required shift
|
|
|
bmi 1f @ shift down?
|
|
|
@ here a shift up by r2 places
|
|
|
cmp r2,#12 @ will clamp?
|
|
|
bge 2f
|
|
|
movs r4,r0
|
|
|
lsls r1,r2
|
|
|
lsls r0,r2
|
|
|
rsbs r2,#0
|
|
|
adds r2,#32 @ complementary shift
|
|
|
lsrs r4,r2
|
|
|
orrs r1,r4
|
|
|
pop {r4,r15}
|
|
|
2:
|
|
|
mvns r0,r3
|
|
|
mvns r1,r3 @ overflow: clamp to extreme fixed-point values
|
|
|
pop {r4,r15}
|
|
|
1:
|
|
|
@ here a shift down by -r2 places
|
|
|
adds r2,#32
|
|
|
bmi 1f @ long shift?
|
|
|
mov r4,r1
|
|
|
lsls r4,r2
|
|
|
rsbs r2,#0
|
|
|
adds r2,#32 @ complementary shift
|
|
|
asrs r1,r2
|
|
|
lsrs r0,r2
|
|
|
orrs r0,r4
|
|
|
pop {r4,r15}
|
|
|
1:
|
|
|
@ here a long shift down
|
|
|
movs r0,r1
|
|
|
asrs r1,#31 @ shift down 32 places
|
|
|
adds r2,#32
|
|
|
bmi 1f @ very long shift?
|
|
|
rsbs r2,#0
|
|
|
adds r2,#32
|
|
|
asrs r0,r2
|
|
|
pop {r4,r15}
|
|
|
1:
|
|
|
movs r0,r3 @ result very near zero: use sign extension bits
|
|
|
movs r1,r3
|
|
|
pop {r4,r15}
|
|
|
|
|
|
@ float <-> double conversions
|
|
|
.thumb_func
|
|
|
qfp_float2double:
|
|
|
lsrs r3,r0,#31 @ sign bit
|
|
|
lsls r3,#31
|
|
|
lsls r1,r0,#1
|
|
|
lsrs r2,r1,#24 @ exponent
|
|
|
beq 1f @ zero?
|
|
|
cmp r2,#0xff @ Inf?
|
|
|
beq 2f
|
|
|
lsrs r1,#4 @ exponent and top 20 bits of mantissa
|
|
|
ldr r2,=#(0x3ff-0x7f)<<20 @ difference in exponent offsets
|
|
|
adds r1,r2
|
|
|
orrs r1,r3
|
|
|
lsls r0,#29 @ bottom 3 bits of mantissa
|
|
|
bx r14
|
|
|
1:
|
|
|
movs r1,r3 @ return signed zero
|
|
|
3:
|
|
|
movs r0,#0
|
|
|
bx r14
|
|
|
2:
|
|
|
ldr r1,=#0x7ff00000 @ return signed infinity
|
|
|
adds r1,r3
|
|
|
b 3b
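
/*
Bit-level sketch of the widening above: rebias the exponent by 0x3ff-0x7f
and move the 23-bit fraction to the top of the 52-bit field; zero/denormal
inputs return signed zero and Inf/NaN inputs return signed infinity (NaN
payloads are not preserved, matching the assembly).

#include <stdint.h>

static uint64_t float2double_bits(uint32_t f)
{
    uint64_t s = (uint64_t)(f>>31)<<63;
    uint32_t e = (f>>23)&0xff;
    uint64_t m = f&0x7fffff;
    if (e==0)    return s;                              // zero/denormal -> signed zero
    if (e==0xff) return s|(0x7ffull<<52);               // Inf/NaN -> signed infinity
    return s|((uint64_t)(e+0x3ff-0x7f)<<52)|(m<<29);
}
*/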
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_double2float:
|
|
|
lsls r2,r1,#1
|
|
|
lsrs r2,#21 @ exponent
|
|
|
ldr r3,=#0x3ff-0x7f
|
|
|
subs r2,r3 @ fix exponent bias
|
|
|
ble 1f @ underflow or zero
|
|
|
cmp r2,#0xff
|
|
|
bge 2f @ overflow or infinity
|
|
|
lsls r2,#23 @ position exponent of result
|
|
|
lsrs r3,r1,#31
|
|
|
lsls r3,#31
|
|
|
orrs r2,r3 @ insert sign
|
|
|
lsls r3,r0,#3 @ rounding bits
|
|
|
lsrs r0,#29
|
|
|
lsls r1,#12
|
|
|
lsrs r1,#9
|
|
|
orrs r0,r1 @ assemble mantissa
|
|
|
orrs r0,r2 @ insert exponent and sign
|
|
|
lsls r3,#1
|
|
|
bcc 3f @ no rounding
|
|
|
beq 4f @ all sticky bits 0?
|
|
|
5:
|
|
|
adds r0,#1
|
|
|
3:
|
|
|
bx r14
|
|
|
4:
|
|
|
lsrs r3,r0,#1 @ odd? then round up
|
|
|
bcs 5b
|
|
|
bx r14
|
|
|
1:
|
|
|
beq 6f @ check case where value is just less than smallest normal
|
|
|
7:
|
|
|
lsrs r0,r1,#31
|
|
|
lsls r0,#31
|
|
|
bx r14
|
|
|
6:
|
|
|
lsls r2,r1,#12 @ top 20 bits of mantissa all 1s?
|
|
|
asrs r2,#12
|
|
|
adds r2,#1
|
|
|
bne 7b
|
|
|
lsrs r2,r0,#29 @ and 3 more 1s?
|
|
|
cmp r2,#7
|
|
|
bne 7b
|
|
|
movs r2,#1 @ return smallest normal with correct sign
|
|
|
b 8f
|
|
|
2:
|
|
|
movs r2,#0xff
|
|
|
8:
|
|
|
lsrs r0,r1,#31 @ pack sign with exponent r2 and zero mantissa (smallest normal or infinity)
|
|
|
lsls r0,#8
|
|
|
adds r0,r2
|
|
|
lsls r0,#23
|
|
|
bx r14
|
|
|
|
|
|
@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_uint2double:
|
|
|
movs r1,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_ufix2double:
|
|
|
movs r2,r1
|
|
|
movs r1,#0
|
|
|
b qfp_ufix642double
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_int2double:
|
|
|
movs r1,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_fix2double:
|
|
|
movs r2,r1
|
|
|
asrs r1,r0,#31 @ sign extend
|
|
|
b qfp_fix642double
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_uint642double:
|
|
|
movs r2,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_ufix642double:
|
|
|
movs r3,#0
|
|
|
b uf2d
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_int642double:
|
|
|
movs r2,#0 @ and fall through
|
|
|
.thumb_func
|
|
|
qfp_fix642double:
|
|
|
asrs r3,r1,#31 @ sign bit across all bits
|
|
|
eors r0,r3
|
|
|
eors r1,r3
|
|
|
subs r0,r3
|
|
|
sbcs r1,r3
|
|
|
uf2d:
|
|
|
push {r4,r5,r14}
|
|
|
ldr r4,=#0x432
|
|
|
subs r2,r4,r2 @ form biased exponent
|
|
|
@ here
|
|
|
@ r0:r1 unnormalised mantissa
|
|
|
@ r2 -Q (will become exponent)
|
|
|
@ r3 sign across all bits
|
|
|
cmp r1,#0
|
|
|
bne 1f @ short normalising shift?
|
|
|
movs r1,r0
|
|
|
beq 2f @ zero? return it
|
|
|
movs r0,#0
|
|
|
subs r2,#32 @ fix exponent
|
|
|
1:
|
|
|
asrs r4,r1,#21
|
|
|
bne 3f @ will need shift down (and rounding?)
|
|
|
bcs 4f @ normalised already?
|
|
|
5:
|
|
|
subs r2,#1
|
|
|
adds r0,r0 @ shift up
|
|
|
adcs r1,r1
|
|
|
lsrs r4,r1,#21
|
|
|
bcc 5b
|
|
|
4:
|
|
|
ldr r4,=#0x7fe
|
|
|
cmp r2,r4
|
|
|
bhs 6f @ over/underflow? return signed zero/infinity
|
|
|
7:
|
|
|
lsls r2,#20 @ pack and return
|
|
|
adds r1,r2
|
|
|
lsls r3,#31
|
|
|
adds r1,r3
|
|
|
2:
|
|
|
pop {r4,r5,r15}
|
|
|
6: @ return signed zero/infinity according to unclamped exponent in r2
|
|
|
mvns r2,r2
|
|
|
lsrs r2,#21
|
|
|
movs r0,#0
|
|
|
movs r1,#0
|
|
|
b 7b
|
|
|
|
|
|
3:
|
|
|
@ here we need to shift down to normalise and possibly round
|
|
|
bmi 1f @ already normalised to Q63?
|
|
|
2:
|
|
|
subs r2,#1
|
|
|
adds r0,r0 @ shift up
|
|
|
adcs r1,r1
|
|
|
bpl 2b
|
|
|
1:
|
|
|
@ here we have a 1 in b63 of r0:r1
|
|
|
adds r2,#11 @ correct exponent for subsequent shift down
|
|
|
lsls r4,r0,#21 @ save bits for rounding
|
|
|
lsrs r0,#11
|
|
|
lsls r5,r1,#21
|
|
|
orrs r0,r5
|
|
|
lsrs r1,#11
|
|
|
lsls r4,#1
|
|
|
beq 1f @ sticky bits are zero?
|
|
|
8:
|
|
|
movs r4,#0
|
|
|
adcs r0,r4
|
|
|
adcs r1,r4
|
|
|
b 4b
|
|
|
1:
|
|
|
bcc 4b @ sticky bits are zero but not on rounding boundary
|
|
|
lsrs r4,r0,#1 @ increment if odd (force round to even)
|
|
|
b 8b
|
|
|
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
.thumb_func
|
|
|
dunpacks:
|
|
|
mdunpacks r0,r1,r2,r3,r4
|
|
|
ldr r3,=#0x3ff
|
|
|
subs r2,r3 @ exponent without offset
|
|
|
bx r14
|
|
|
|
|
|
@ r0:r1 signed mantissa Q52
|
|
|
@ r2 unbiased exponent < 10 (i.e., |x|<2^10)
|
|
|
@ r4 pointer to:
|
|
|
@ - divisor reciprocal approximation r=1/d Q15
|
|
|
@ - divisor d Q62 0..20
|
|
|
@ - divisor d Q62 21..41
|
|
|
@ - divisor d Q62 42..62
|
|
|
@ returns:
|
|
|
@ r0:r1 reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)
|
|
|
@ r2 quotient q (number of reductions)
|
|
|
@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign
|
|
|
@ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)
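
/*
Sketch of the reduction in C (illustrative types and names): q is formed
from the Q15 reciprocal, and q*d is then subtracted exactly, with d held as
three 21-bit pieces so that the q*piece products can be formed with the
M0+'s 32-bit multiply (the sketch uses 64-bit products for clarity).
Two's-complement wrap-around of the Q62 arithmetic is assumed, as in the
assembly.

#include <stdint.h>

typedef struct { int32_t r_q15, dL, dM, dH; } red_const_t;  // layout as dreddata0/dreddata1

// x_q52: argument Q52, |x| < 2^10;  returns remainder x-q*d in Q62, q via *qout
static int64_t reduce(int64_t x_q52, const red_const_t *c, int32_t *qout)
{
    int32_t q = (c->r_q15*(int32_t)(x_q52>>47)+(1<<19))>>20;  // round(x*(1/d)): Q15*Q5 -> Q20 -> Q0
    uint64_t x  = (uint64_t)x_q52<<10;                        // x Q62 (mod 2^64)
    uint64_t qd = (uint64_t)(int64_t)q*(uint64_t)c->dL        // q*d Q62 (mod 2^64), d = dH:dM:dL
                + ((uint64_t)(int64_t)q*(uint64_t)c->dM<<21)
                + ((uint64_t)(int64_t)q*(uint64_t)c->dH<<42);
    *qout = q;
    return (int64_t)(x-qd);                                   // small, so exact despite the wrap
}
*/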
|
|
|
.thumb_func
|
|
|
dreduce:
|
|
|
adds r2,#2 @ e+2
|
|
|
bmi 1f @ |x|<0.25, too small to need adjustment
|
|
|
cmp r2,#12
|
|
|
bge 4f
|
|
|
2:
|
|
|
movs r5,#17
|
|
|
subs r5,r2 @ 15-e
|
|
|
movs r3,r1 @ Q20
|
|
|
asrs r3,r5 @ x Q5
|
|
|
adds r2,#8 @ e+10
|
|
|
adds r5,#7 @ 22-e = 32-(e+10)
|
|
|
movs r6,r0
|
|
|
lsrs r6,r5
|
|
|
lsls r0,r2
|
|
|
lsls r1,r2
|
|
|
orrs r1,r6 @ r0:r1 x Q62
|
|
|
ldmia r4,{r4-r7}
|
|
|
muls r3,r4 @ rx Q20
|
|
|
asrs r2,r3,#20
|
|
|
movs r3,#0
|
|
|
adcs r2,r3 @ rx Q0 rounded = q; for e.g. r=1.5 |q|<1.5*2^10
|
|
|
muls r5,r2 @ qd in pieces: L Q62
|
|
|
muls r6,r2 @ M Q41
|
|
|
muls r7,r2 @ H Q20
|
|
|
lsls r7,#10
|
|
|
asrs r4,r6,#11
|
|
|
lsls r6,#21
|
|
|
adds r6,r5
|
|
|
adcs r7,r4
|
|
|
asrs r5,#31
|
|
|
adds r7,r5 @ r6:r7 qd Q62
|
|
|
subs r0,r6
|
|
|
sbcs r1,r7 @ remainder Q62
|
|
|
bx r14
|
|
|
4:
|
|
|
movs r2,#12 @ overflow: clamp to +/-1024
|
|
|
movs r0,#0
|
|
|
asrs r1,#31
|
|
|
lsls r1,#1
|
|
|
adds r1,#1
|
|
|
lsls r1,#20
|
|
|
b 2b
|
|
|
|
|
|
1:
|
|
|
lsls r1,#8
|
|
|
lsrs r3,r0,#24
|
|
|
orrs r1,r3
|
|
|
lsls r0,#8 @ r0:r1 Q60, to be shifted down -r2 places
|
|
|
rsbs r3,r2,#0
|
|
|
adds r2,#32 @ shift down in r3, complementary shift in r2
|
|
|
bmi 1f @ long shift?
|
|
|
2:
|
|
|
movs r4,r1
|
|
|
asrs r1,r3
|
|
|
lsls r4,r2
|
|
|
lsrs r0,r3
|
|
|
orrs r0,r4
|
|
|
movs r2,#0 @ rounding
|
|
|
adcs r0,r2
|
|
|
adcs r1,r2
|
|
|
bx r14
|
|
|
|
|
|
1:
|
|
|
movs r0,r1 @ down 32 places
|
|
|
asrs r1,#31
|
|
|
subs r3,#32
|
|
|
adds r2,#32
|
|
|
bpl 2b
|
|
|
movs r0,#0 @ very long shift? return 0
|
|
|
movs r1,#0
|
|
|
movs r2,#0
|
|
|
bx r14
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_dtan:
|
|
|
push {r4-r7,r14}
|
|
|
bl push_r8_r11
|
|
|
bl dsincos
|
|
|
mov r12,r0 @ save ε
|
|
|
bl dcos_finish
|
|
|
push {r0,r1}
|
|
|
mov r0,r12
|
|
|
bl dsin_finish
|
|
|
pop {r2,r3}
|
|
|
bl pop_r8_r11
|
|
|
b ddiv0 @ compute sin θ/cos θ
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_dcos:
|
|
|
push {r4-r7,r14}
|
|
|
bl push_r8_r11
|
|
|
bl dsincos
|
|
|
bl dcos_finish
|
|
|
b 1f
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_dsin:
|
|
|
push {r4-r7,r14}
|
|
|
bl push_r8_r11
|
|
|
bl dsincos
|
|
|
bl dsin_finish
|
|
|
1:
|
|
|
bl pop_r8_r11
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
|
|
|
@ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that
|
|
|
@ θ=α+ε and |ε|≤2^-32
|
|
|
@ on return:
|
|
|
@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
|
|
|
@ r8:r9 cos α Q62
|
|
|
@ r10:r11 sin α Q62
|
|
|
.thumb_func
|
|
|
dsincos:
|
|
|
push {r14}
|
|
|
bl dunpacks
|
|
|
adr r4,dreddata0
|
|
|
bl dreduce
|
|
|
|
|
|
movs r4,#0
|
|
|
ldr r5,=#0x9df04dbb @ this value compensates for the non-unity scaling of the CORDIC rotations
|
|
|
ldr r6,=#0x36f656c5
|
|
|
lsls r2,#31
|
|
|
bcc 1f
|
|
|
@ quadrant 2 or 3
|
|
|
mvns r6,r6
|
|
|
rsbs r5,r5,#0
|
|
|
adcs r6,r4
|
|
|
1:
|
|
|
lsls r2,#1
|
|
|
bcs 1f
|
|
|
@ even quadrant
|
|
|
mov r10,r4
|
|
|
mov r11,r4
|
|
|
mov r8,r5
|
|
|
mov r9,r6
|
|
|
b 2f
|
|
|
1:
|
|
|
@ odd quadrant
|
|
|
mov r8,r4
|
|
|
mov r9,r4
|
|
|
mov r10,r5
|
|
|
mov r11,r6
|
|
|
2:
|
|
|
adr r4,dtab_cc
|
|
|
mov r12,r4
|
|
|
movs r7,#1
|
|
|
movs r6,#31
|
|
|
1:
|
|
|
bl dcordic_rot_step
|
|
|
adds r7,#1
|
|
|
subs r6,#1
|
|
|
cmp r7,#33
|
|
|
bne 1b
|
|
|
pop {r15}
|
|
|
|
|
|
dcos_finish:
|
|
|
@ here
|
|
|
@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
|
|
|
@ r8:r9 cos α Q62
|
|
|
@ r10:r11 sin α Q62
|
|
|
@ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α
|
|
|
mov r1,r11
|
|
|
@ mov r2,r10
|
|
|
@ lsrs r2,#31
|
|
|
@ adds r1,r2 @ rounding improves accuracy very slightly
|
|
|
muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
|
|
|
@ r2:r3 ε sin α Q(62+62-32)=Q92
|
|
|
mov r0,r8
|
|
|
mov r1,r9
|
|
|
lsls r5,r3,#2
|
|
|
asrs r3,r3,#30
|
|
|
lsrs r2,r2,#30
|
|
|
orrs r2,r5
|
|
|
sbcs r0,r2 @ include rounding
|
|
|
sbcs r1,r3
|
|
|
movs r2,#62
|
|
|
b qfp_fix642double
|
|
|
|
|
|
dsin_finish:
|
|
|
@ here
|
|
|
@ r0:r1 ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
|
|
|
@ r8:r9 cos α Q62
|
|
|
@ r10:r11 sin α Q62
|
|
|
@ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α
|
|
|
mov r1,r9
|
|
|
muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
|
|
|
@ r2:r3 ε cos α Q(62+62-32)=Q92
|
|
|
mov r0,r10
|
|
|
mov r1,r11
|
|
|
lsls r5,r3,#2
|
|
|
asrs r3,r3,#30
|
|
|
lsrs r2,r2,#30
|
|
|
orrs r2,r5
|
|
|
adcs r0,r2 @ include rounding
|
|
|
adcs r1,r3
|
|
|
movs r2,#62
|
|
|
b qfp_fix642double
|
|
|
|
|
|
.ltorg
|
|
|
.align 2
|
|
|
dreddata0:
|
|
|
.word 0x0000517d @ 2/π Q15
|
|
|
.word 0x0014611A @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces
|
|
|
.word 0x000A8885
|
|
|
.word 0x001921FB
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_datan2:
|
|
|
@ r0:r1 y
|
|
|
@ r2:r3 x
|
|
|
push {r4-r7,r14}
|
|
|
bl push_r8_r11
|
|
|
ldr r5,=#0x7ff00000
|
|
|
movs r4,r1
|
|
|
ands r4,r5 @ y==0?
|
|
|
beq 1f
|
|
|
cmp r4,r5 @ or Inf/NaN?
|
|
|
bne 2f
|
|
|
1:
|
|
|
lsrs r1,#20 @ flush
|
|
|
lsls r1,#20
|
|
|
movs r0,#0
|
|
|
2:
|
|
|
movs r4,r3
|
|
|
ands r4,r5 @ x==0?
|
|
|
beq 1f
|
|
|
cmp r4,r5 @ or Inf/NaN?
|
|
|
bne 2f
|
|
|
1:
|
|
|
lsrs r3,#20 @ flush
|
|
|
lsls r3,#20
|
|
|
movs r2,#0
|
|
|
2:
|
|
|
movs r6,#0 @ quadrant offset
|
|
|
lsls r5,#11 @ constant 0x80000000
|
|
|
cmp r3,#0
|
|
|
bpl 1f @ skip if x positive
|
|
|
movs r6,#2
|
|
|
eors r3,r5
|
|
|
eors r1,r5
|
|
|
bmi 1f @ quadrant offset=+2 if y was positive
|
|
|
rsbs r6,#0 @ quadrant offset=-2 if y was negative
|
|
|
1:
|
|
|
@ now in quadrant 0 or 3
|
|
|
adds r7,r1,r5 @ r7=-r1
|
|
|
bpl 1f
|
|
|
@ y>=0: in quadrant 0
|
|
|
cmp r1,r3
|
|
|
ble 2f @ y<~x so 0≤θ<~π/4: skip
|
|
|
adds r6,#1
|
|
|
eors r1,r5 @ negate x
|
|
|
b 3f @ and exchange x and y = rotate by -π/2
|
|
|
1:
|
|
|
cmp r3,r7
|
|
|
bge 2f @ -y<~x so -π/4<~θ≤0: skip
|
|
|
subs r6,#1
|
|
|
eors r3,r5 @ negate y and ...
|
|
|
3:
|
|
|
movs r7,r0 @ exchange x and y
|
|
|
movs r0,r2
|
|
|
movs r2,r7
|
|
|
movs r7,r1
|
|
|
movs r1,r3
|
|
|
movs r3,r7
|
|
|
2:
|
|
|
@ here -π/4<~θ<~π/4
|
|
|
@ r6 has quadrant offset
|
|
|
push {r6}
|
|
|
cmp r2,#0
|
|
|
bne 1f
|
|
|
cmp r3,#0
|
|
|
beq 10f @ x==0 going into division?
|
|
|
lsls r4,r3,#1
|
|
|
asrs r4,#21
|
|
|
adds r4,#1
|
|
|
bne 1f @ x==Inf going into division?
|
|
|
lsls r4,r1,#1
|
|
|
asrs r4,#21
|
|
|
adds r4,#1 @ y also ±Inf?
|
|
|
bne 10f
|
|
|
subs r1,#1 @ make them both just finite
|
|
|
subs r3,#1
|
|
|
b 1f
|
|
|
|
|
|
10:
|
|
|
movs r0,#0
|
|
|
movs r1,#0
|
|
|
b 12f
|
|
|
|
|
|
1:
|
|
|
bl qfp_ddiv
|
|
|
movs r2,#62
|
|
|
bl qfp_double2fix64
|
|
|
@ r0:r1 y/x
|
|
|
mov r10,r0
|
|
|
mov r11,r1
|
|
|
movs r0,#0 @ ω=0
|
|
|
movs r1,#0
|
|
|
mov r8,r0
|
|
|
movs r2,#1
|
|
|
lsls r2,#30
|
|
|
mov r9,r2 @ x=1
|
|
|
|
|
|
adr r4,dtab_cc
|
|
|
mov r12,r4
|
|
|
movs r7,#1
|
|
|
movs r6,#31
|
|
|
1:
|
|
|
bl dcordic_vec_step
|
|
|
adds r7,#1
|
|
|
subs r6,#1
|
|
|
cmp r7,#33
|
|
|
bne 1b
|
|
|
@ r0:r1 atan(y/x) Q62
|
|
|
@ r8:r9 x residual Q62
|
|
|
@ r10:r11 y residual Q62
|
|
|
mov r2,r9
|
|
|
mov r3,r10
|
|
|
subs r2,#12 @ this makes atan(0)==0
|
|
|
@ the following is basically a division: residual y/x ~ atan(residual y/x)
|
|
|
movs r4,#1
|
|
|
lsls r4,#29
|
|
|
movs r7,#0
|
|
|
2:
|
|
|
lsrs r2,#1
|
|
|
movs r3,r3 @ preserve carry
|
|
|
bmi 1f
|
|
|
sbcs r3,r2
|
|
|
adds r0,r4
|
|
|
adcs r1,r7
|
|
|
lsrs r4,#1
|
|
|
bne 2b
|
|
|
b 3f
|
|
|
1:
|
|
|
adcs r3,r2
|
|
|
subs r0,r4
|
|
|
sbcs r1,r7
|
|
|
lsrs r4,#1
|
|
|
bne 2b
|
|
|
3:
|
|
|
lsls r6,r1,#31
|
|
|
asrs r1,#1
|
|
|
lsrs r0,#1
|
|
|
orrs r0,r6 @ Q61
|
|
|
|
|
|
12:
|
|
|
pop {r6}
|
|
|
|
|
|
cmp r6,#0
|
|
|
beq 1f
|
|
|
ldr r4,=#0x885A308D @ π/2 Q61
|
|
|
ldr r5,=#0x3243F6A8
|
|
|
bpl 2f
|
|
|
mvns r4,r4 @ negative quadrant offset
|
|
|
mvns r5,r5
|
|
|
2:
|
|
|
lsls r6,#31
|
|
|
bne 2f @ skip if quadrant offset is ±1
|
|
|
adds r0,r4
|
|
|
adcs r1,r5
|
|
|
2:
|
|
|
adds r0,r4
|
|
|
adcs r1,r5
|
|
|
1:
|
|
|
movs r2,#61
|
|
|
bl qfp_fix642double
|
|
|
|
|
|
bl pop_r8_r11
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
dtab_cc:
|
|
|
.word 0x61bb4f69, 0x1dac6705 @ atan 2^-1 Q62
|
|
|
.word 0x96406eb1, 0x0fadbafc @ atan 2^-2 Q62
|
|
|
.word 0xab0bdb72, 0x07f56ea6 @ atan 2^-3 Q62
|
|
|
.word 0xe59fbd39, 0x03feab76 @ atan 2^-4 Q62
|
|
|
.word 0xba97624b, 0x01ffd55b @ atan 2^-5 Q62
|
|
|
.word 0xdddb94d6, 0x00fffaaa @ atan 2^-6 Q62
|
|
|
.word 0x56eeea5d, 0x007fff55 @ atan 2^-7 Q62
|
|
|
.word 0xaab7776e, 0x003fffea @ atan 2^-8 Q62
|
|
|
.word 0x5555bbbc, 0x001ffffd @ atan 2^-9 Q62
|
|
|
.word 0xaaaaadde, 0x000fffff @ atan 2^-10 Q62
|
|
|
.word 0xf555556f, 0x0007ffff @ atan 2^-11 Q62
|
|
|
.word 0xfeaaaaab, 0x0003ffff @ atan 2^-12 Q62
|
|
|
.word 0xffd55555, 0x0001ffff @ atan 2^-13 Q62
|
|
|
.word 0xfffaaaab, 0x0000ffff @ atan 2^-14 Q62
|
|
|
.word 0xffff5555, 0x00007fff @ atan 2^-15 Q62
|
|
|
.word 0xffffeaab, 0x00003fff @ atan 2^-16 Q62
|
|
|
.word 0xfffffd55, 0x00001fff @ atan 2^-17 Q62
|
|
|
.word 0xffffffab, 0x00000fff @ atan 2^-18 Q62
|
|
|
.word 0xfffffff5, 0x000007ff @ atan 2^-19 Q62
|
|
|
.word 0xffffffff, 0x000003ff @ atan 2^-20 Q62
|
|
|
.word 0x00000000, 0x00000200 @ atan 2^-21 Q62 @ consider optimising these
|
|
|
.word 0x00000000, 0x00000100 @ atan 2^-22 Q62
|
|
|
.word 0x00000000, 0x00000080 @ atan 2^-23 Q62
|
|
|
.word 0x00000000, 0x00000040 @ atan 2^-24 Q62
|
|
|
.word 0x00000000, 0x00000020 @ atan 2^-25 Q62
|
|
|
.word 0x00000000, 0x00000010 @ atan 2^-26 Q62
|
|
|
.word 0x00000000, 0x00000008 @ atan 2^-27 Q62
|
|
|
.word 0x00000000, 0x00000004 @ atan 2^-28 Q62
|
|
|
.word 0x00000000, 0x00000002 @ atan 2^-29 Q62
|
|
|
.word 0x00000000, 0x00000001 @ atan 2^-30 Q62
|
|
|
.word 0x80000000, 0x00000000 @ atan 2^-31 Q62
|
|
|
.word 0x40000000, 0x00000000 @ atan 2^-32 Q62
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_dexp:
|
|
|
push {r4-r7,r14}
|
|
|
bl dunpacks
|
|
|
adr r4,dreddata1
|
|
|
bl dreduce
|
|
|
cmp r1,#0
|
|
|
bge 1f
|
|
|
ldr r4,=#0xF473DE6B
|
|
|
ldr r5,=#0x2C5C85FD @ ln2 Q62
|
|
|
adds r0,r4
|
|
|
adcs r1,r5
|
|
|
subs r2,#1
|
|
|
1:
|
|
|
push {r2}
|
|
|
movs r7,#1 @ shift
|
|
|
adr r6,dtab_exp
|
|
|
movs r2,#0
|
|
|
movs r3,#1
|
|
|
lsls r3,#30 @ x=1 Q62
|
|
|
|
|
|
3:
|
|
|
ldmia r6!,{r4,r5}
|
|
|
mov r12,r6
|
|
|
subs r0,r4
|
|
|
sbcs r1,r5
|
|
|
bmi 1f
|
|
|
|
|
|
rsbs r6,r7,#0
|
|
|
adds r6,#32 @ complementary shift
|
|
|
movs r5,r3
|
|
|
asrs r5,r7
|
|
|
movs r4,r3
|
|
|
lsls r4,r6
|
|
|
movs r6,r2
|
|
|
lsrs r6,r7 @ rounding bit in carry
|
|
|
orrs r4,r6
|
|
|
adcs r2,r4
|
|
|
adcs r3,r5 @ x+=x>>i
|
|
|
b 2f
|
|
|
|
|
|
1:
|
|
|
adds r0,r4 @ restore argument
|
|
|
adcs r1,r5
|
|
|
2:
|
|
|
mov r6,r12
|
|
|
adds r7,#1
|
|
|
cmp r7,#33
|
|
|
bne 3b
|
|
|
|
|
|
@ here
|
|
|
@ r0:r1 ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)
|
|
|
@ r2:r3 exp a Q62
|
|
|
@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
|
|
|
muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1
|
|
|
@ r4:r1 ε exp a Q(62+62-32)=Q92
|
|
|
lsrs r4,#30
|
|
|
lsls r0,r1,#2
|
|
|
orrs r0,r4
|
|
|
asrs r1,#30
|
|
|
adds r0,r2
|
|
|
adcs r1,r3
|
|
|
|
|
|
pop {r2}
|
|
|
rsbs r2,#0
|
|
|
adds r2,#62
|
|
|
bl qfp_fix642double @ in principle we can pack faster than this because we know the exponent
|
|
|
pop {r4-r7,r15}
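
/*
The loop above in miniature (Q30 toy precision, standard C; the real code
works in Q62 across register pairs and also handles a negative reduced
argument by adding ln2 and decrementing the exponent): x is decomposed
greedily into log(1+2^-i) terms taken from dtab_exp, each accepted term
multiplies the accumulator by (1+2^-i), i.e. acc += acc>>i, and the tiny
residual e is folded in as exp(e) ~ 1+e.

#include <stdint.h>
#include <math.h>

static double exp_reduced(double x)          // 0 <= x < ln 2
{
    int64_t w   = (int64_t)(x*(1<<30));                    // argument, Q30
    int64_t acc = (int64_t)1<<30;                          // running product, Q30
    for (int i=1; i<=30; i++) {
        int64_t l = llround(log1p(ldexp(1.0,-i))*(1<<30)); // log(1+2^-i), Q30
        if (w>=l) { w -= l; acc += acc>>i; }               // take the factor 1+2^-i
    }
    acc += (acc*w)>>30;                                    // exp(residual) ~ 1+residual
    return (double)acc/(1<<30);                            // exp(x), ~30-bit accuracy
}
*/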
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
.thumb_func
|
|
|
qfp_dln:
|
|
|
push {r4-r7,r14}
|
|
|
lsls r7,r1,#1
|
|
|
bcs 5f @ <0 ...
|
|
|
asrs r7,#21
|
|
|
beq 5f @ ... or =0? return -Inf
|
|
|
adds r7,#1
|
|
|
beq 6f @ Inf/NaN? return +Inf
|
|
|
bl dunpacks
|
|
|
push {r2}
|
|
|
lsls r1,#9
|
|
|
lsrs r2,r0,#23
|
|
|
orrs r1,r2
|
|
|
lsls r0,#9
|
|
|
@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1
|
|
|
|
|
|
movs r7,#1 @ shift
|
|
|
adr r6,dtab_exp
|
|
|
mov r12,r6
|
|
|
movs r2,#0
|
|
|
movs r3,#0 @ y=0 Q62
|
|
|
|
|
|
3:
|
|
|
rsbs r6,r7,#0
|
|
|
adds r6,#32 @ complementary shift
|
|
|
movs r5,r1
|
|
|
asrs r5,r7
|
|
|
movs r4,r1
|
|
|
lsls r4,r6
|
|
|
movs r6,r0
|
|
|
lsrs r6,r7
|
|
|
orrs r4,r6 @ x>>i, rounding bit in carry
|
|
|
adcs r4,r0
|
|
|
adcs r5,r1 @ x+(x>>i)
|
|
|
|
|
|
lsrs r6,r5,#30
|
|
|
bne 1f @ x+(x>>i)>1?
|
|
|
movs r0,r4
|
|
|
movs r1,r5 @ x+=x>>i
|
|
|
mov r6,r12
|
|
|
ldmia r6!,{r4,r5}
|
|
|
subs r2,r4
|
|
|
sbcs r3,r5
|
|
|
|
|
|
1:
|
|
|
movs r4,#8
|
|
|
add r12,r4
|
|
|
adds r7,#1
|
|
|
cmp r7,#33
|
|
|
bne 3b
|
|
|
@ here:
|
|
|
@ r0:r1 residual x, nearly 1 Q62
|
|
|
@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62
|
|
|
@ result is y + ln2 + ln x ~ y + ln2 + (x-1)
|
|
|
lsls r1,#2
|
|
|
asrs r1,#2 @ x-1
|
|
|
adds r2,r0
|
|
|
adcs r3,r1
|
|
|
|
|
|
pop {r7}
|
|
|
@ here:
|
|
|
@ r2:r3 ln m/2 = ln m - ln2 Q62
|
|
|
@ r7 unbiased exponent
|
|
|
|
|
|
adr r4,dreddata1+4
|
|
|
ldmia r4,{r0,r1,r4}
|
|
|
adds r7,#1
|
|
|
muls r0,r7 @ Q62
|
|
|
muls r1,r7 @ Q41
|
|
|
muls r4,r7 @ Q20
|
|
|
lsls r7,r1,#21
|
|
|
asrs r1,#11
|
|
|
asrs r5,r1,#31
|
|
|
adds r0,r7
|
|
|
adcs r1,r5
|
|
|
lsls r7,r4,#10
|
|
|
asrs r4,#22
|
|
|
asrs r5,r1,#31
|
|
|
adds r1,r7
|
|
|
adcs r4,r5
|
|
|
@ r0:r1:r4 exponent*ln2 Q62
|
|
|
asrs r5,r3,#31
|
|
|
adds r0,r2
|
|
|
adcs r1,r3
|
|
|
adcs r4,r5
|
|
|
@ r0:r1:r4 result Q62
|
|
|
movs r2,#62
|
|
|
1:
|
|
|
asrs r5,r1,#31
|
|
|
cmp r4,r5
|
|
|
beq 2f @ r4 a sign extension of r1?
|
|
|
lsrs r0,#4 @ no: shift down 4 places and try again
|
|
|
lsls r6,r1,#28
|
|
|
orrs r0,r6
|
|
|
lsrs r1,#4
|
|
|
lsls r6,r4,#28
|
|
|
orrs r1,r6
|
|
|
asrs r4,#4
|
|
|
subs r2,#4
|
|
|
b 1b
|
|
|
2:
|
|
|
bl qfp_fix642double
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
5:
|
|
|
ldr r1,=#0xfff00000
|
|
|
movs r0,#0
|
|
|
pop {r4-r7,r15}
|
|
|
|
|
|
6:
|
|
|
ldr r1,=#0x7ff00000
|
|
|
movs r0,#0
|
|
|
pop {r4-r7,r15}
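
/*
And the logarithm runs the same machinery backwards (again a Q30 toy; the
real code works in Q62 and adds (exponent+1)*ln2 from the 21-bit pieces in
dreddata1): the mantissa m/2 in [0.5,1) is multiplied by those (1+2^-i)
factors that keep the product below 1 while the matching log(1+2^-i) values
are subtracted, and the final deficit is absorbed as ln(x) ~ x-1.

#include <stdint.h>
#include <math.h>

static double ln_mantissa(double m)          // 1 <= m < 2
{
    int64_t x = (int64_t)(m*(1<<29));                    // m/2, Q30, in [0.5,1)
    int64_t y = 0;                                       // accumulates ln(m/2), Q30
    for (int i=1; i<=30; i++) {
        int64_t l = llround(log1p(ldexp(1.0,-i))*(1<<30));
        int64_t t = x+(x>>i);                            // candidate x*(1+2^-i)
        if (t<(int64_t)1<<30) { x = t; y -= l; }
    }
    y += x-((int64_t)1<<30);                             // ln(x) ~ x-1 for x ~ 1
    return (double)y/(1<<30)+log(2.0);                   // ln m = ln(m/2) + ln 2
}
*/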
|
|
|
|
|
|
|
|
|
.ltorg
|
|
|
|
|
|
.align 2
|
|
|
dreddata1:
|
|
|
.word 0x0000B8AA @ 1/ln2 Q15
|
|
|
.word 0x0013DE6B @ ln2 Q62=2C5C85FDF473DE6B split into 21-bit pieces
|
|
|
.word 0x000FEFA3
|
|
|
.word 0x000B1721
|
|
|
|
|
|
dtab_exp:
|
|
|
.word 0xbf984bf3, 0x19f323ec @ log 1+2^-1 Q62
|
|
|
.word 0xcd4d10d6, 0x0e47fbe3 @ log 1+2^-2 Q62
|
|
|
.word 0x8abcb97a, 0x0789c1db @ log 1+2^-3 Q62
|
|
|
.word 0x022c54cc, 0x03e14618 @ log 1+2^-4 Q62
|
|
|
.word 0xe7833005, 0x01f829b0 @ log 1+2^-5 Q62
|
|
|
.word 0x87e01f1e, 0x00fe0545 @ log 1+2^-6 Q62
|
|
|
.word 0xac419e24, 0x007f80a9 @ log 1+2^-7 Q62
|
|
|
.word 0x45621781, 0x003fe015 @ log 1+2^-8 Q62
|
|
|
.word 0xa9ab10e6, 0x001ff802 @ log 1+2^-9 Q62
|
|
|
.word 0x55455888, 0x000ffe00 @ log 1+2^-10 Q62
|
|
|
.word 0x0aa9aac4, 0x0007ff80 @ log 1+2^-11 Q62
|
|
|
.word 0x01554556, 0x0003ffe0 @ log 1+2^-12 Q62
|
|
|
.word 0x002aa9ab, 0x0001fff8 @ log 1+2^-13 Q62
|
|
|
.word 0x00055545, 0x0000fffe @ log 1+2^-14 Q62
|
|
|
.word 0x8000aaaa, 0x00007fff @ log 1+2^-15 Q62
|
|
|
.word 0xe0001555, 0x00003fff @ log 1+2^-16 Q62
|
|
|
.word 0xf80002ab, 0x00001fff @ log 1+2^-17 Q62
|
|
|
.word 0xfe000055, 0x00000fff @ log 1+2^-18 Q62
|
|
|
.word 0xff80000b, 0x000007ff @ log 1+2^-19 Q62
|
|
|
.word 0xffe00001, 0x000003ff @ log 1+2^-20 Q62
|
|
|
.word 0xfff80000, 0x000001ff @ log 1+2^-21 Q62
|
|
|
.word 0xfffe0000, 0x000000ff @ log 1+2^-22 Q62
|
|
|
.word 0xffff8000, 0x0000007f @ log 1+2^-23 Q62
|
|
|
.word 0xffffe000, 0x0000003f @ log 1+2^-24 Q62
|
|
|
.word 0xfffff800, 0x0000001f @ log 1+2^-25 Q62
|
|
|
.word 0xfffffe00, 0x0000000f @ log 1+2^-26 Q62
|
|
|
.word 0xffffff80, 0x00000007 @ log 1+2^-27 Q62
|
|
|
.word 0xffffffe0, 0x00000003 @ log 1+2^-28 Q62
|
|
|
.word 0xfffffff8, 0x00000001 @ log 1+2^-29 Q62
|
|
|
.word 0xfffffffe, 0x00000000 @ log 1+2^-30 Q62
|
|
|
.word 0x80000000, 0x00000000 @ log 1+2^-31 Q62
|
|
|
.word 0x40000000, 0x00000000 @ log 1+2^-32 Q62
|
|
|
|
|
|
qfp_lib_end:
|