You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1040 lines
25 KiB
ArmAsm
1040 lines
25 KiB
ArmAsm
@ Copyright 2019-2024 Mark Owen
|
|
@ http://www.quinapalus.com
|
|
@ E-mail: qfp@quinapalus.com
|
|
@
|
|
@ This file is free software: you can redistribute it and/or modify
|
|
@ it under the terms of version 2 of the GNU General Public License
|
|
@ as published by the Free Software Foundation.
|
|
@
|
|
@ This file is distributed in the hope that it will be useful,
|
|
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
@ GNU General Public License for more details.
|
|
@
|
|
@ You should have received a copy of the GNU General Public License
|
|
@ along with this file. If not, see <http://www.gnu.org/licenses/> or
|
|
@ write to the Free Software Foundation, Inc., 51 Franklin Street,
|
|
@ Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
.syntax unified
|
|
.cpu cortex-m0plus
|
|
.thumb
|
|
|
|
@ exported symbols
|
|
|
|
.global qfp_fadd
|
|
.global qfp_fsub
|
|
.global qfp_fmul
|
|
.global qfp_fdiv
|
|
.global qfp_fcmp
|
|
.global qfp_fsqrt
|
|
.global qfp_float2int
|
|
.global qfp_float2fix
|
|
.global qfp_float2uint
|
|
.global qfp_float2ufix
|
|
.global qfp_int2float
|
|
.global qfp_fix2float
|
|
.global qfp_uint2float
|
|
.global qfp_ufix2float
|
|
.global qfp_fexp
|
|
.global qfp_fln
|
|
|
|
qfp_lib_start:
|
|
|
|
@ exchange r0<->r1, r2<->r3
|
|
xchxy:
|
|
push {r0,r2,r14}
|
|
mov r0,r1
|
|
mov r2,r3
|
|
pop {r1,r3,r15}
|
|
|
|
@ IEEE single in r0-> signed (two's complemennt) mantissa in r0 9Q23 (24 significant bits), signed exponent (bias removed) in r2
|
|
@ trashes r4; zero, denormal -> mantissa=+/-1, exponent=-380; Inf, NaN -> mantissa=+/-1, exponent=+640
|
|
unpackx:
|
|
lsrs r2,r0,#23 @ save exponent and sign
|
|
lsls r0,#9 @ extract mantissa
|
|
lsrs r0,#9
|
|
movs r4,#1
|
|
lsls r4,#23
|
|
orrs r0,r4 @ reinstate implied leading 1
|
|
cmp r2,#255 @ test sign bit
|
|
uxtb r2,r2 @ clear it
|
|
bls 1f @ branch on positive
|
|
rsbs r0,#0 @ negate mantissa
|
|
1:
|
|
subs r2,#1
|
|
cmp r2,#254 @ zero/denormal/Inf/NaN?
|
|
bhs 2f
|
|
subs r2,#126 @ remove exponent bias: can now be -126..+127
|
|
bx r14
|
|
|
|
2: @ here with special-case values
|
|
cmp r0,#0
|
|
mov r0,r4 @ set mantissa to +1
|
|
bpl 3f
|
|
rsbs r0,#0 @ zero/denormal/Inf/NaN: mantissa=+/-1
|
|
3:
|
|
subs r2,#126 @ zero/denormal: exponent -> -127; Inf, NaN: exponent -> 128
|
|
lsls r2,#2 @ zero/denormal: exponent -> -508; Inf, NaN: exponent -> 512
|
|
adds r2,#128 @ zero/denormal: exponent -> -380; Inf, NaN: exponent -> 640
|
|
bx r14
|
|
|
|
@ normalise and pack signed mantissa in r0 nominally 3Q29, signed exponent in r2-> IEEE single in r0
|
|
@ trashes r4, preserves r1,r3
|
|
@ r5: "sticky bits", must be zero iff all result bits below r0 are zero for correct rounding
|
|
packx:
|
|
lsrs r4,r0,#31 @ save sign bit
|
|
lsls r4,r4,#31 @ sign now in b31
|
|
bpl 2f @ skip if positive
|
|
cmp r5,#0
|
|
beq 11f
|
|
adds r0,#1 @ fiddle carry in to following rsb if sticky bits are non-zero
|
|
11:
|
|
rsbs r0,#0 @ can now treat r0 as unsigned
|
|
packx0:
|
|
bmi 3f @ catch r0=0x80000000 case
|
|
2:
|
|
subs r2,#1 @ normalisation loop
|
|
adds r0,r0
|
|
beq 1f @ zero? special case
|
|
bpl 2b @ normalise so leading "1" in bit 31
|
|
3:
|
|
adds r2,#129 @ (mis-)offset exponent
|
|
bne 12f @ special case: highest denormal can round to lowest normal
|
|
adds r0,#0x80 @ in special case, need to add 256 to r0 for rounding
|
|
bcs 4f @ tripped carry? then have leading 1 in C as required
|
|
12:
|
|
adds r0,#0x80 @ rounding
|
|
bcs 4f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
|
|
cmp r5,#0
|
|
beq 7f @ sticky bits zero?
|
|
8:
|
|
lsls r0,#1 @ remove leading 1
|
|
9:
|
|
subs r2,#1 @ compensate exponent on this path
|
|
4:
|
|
cmp r2,#254
|
|
bge 5f @ overflow?
|
|
adds r2,#1 @ correct exponent offset
|
|
ble 10f @ denormal/underflow?
|
|
lsrs r0,#9 @ align mantissa
|
|
lsls r2,#23 @ align exponent
|
|
orrs r0,r2 @ assemble exponent and mantissa
|
|
6:
|
|
orrs r0,r4 @ apply sign
|
|
1:
|
|
bx r14
|
|
|
|
5:
|
|
movs r0,#0xff @ create infinity
|
|
lsls r0,#23
|
|
b 6b
|
|
|
|
10:
|
|
movs r0,#0 @ create zero
|
|
bx r14
|
|
|
|
7: @ sticky bit rounding case
|
|
lsls r5,r0,#24 @ check bottom 8 bits of r0
|
|
bne 8b @ in rounding-tie case?
|
|
lsrs r0,#9 @ ensure even result
|
|
lsls r0,#10
|
|
b 9b
|
|
|
|
.align 2
|
|
.ltorg
|
|
|
|
@ signed multiply r0 1Q23 by r1 4Q23, result in r0 7Q25, sticky bits in r5
|
|
@ trashes r3,r4
|
|
mul0:
|
|
uxth r3,r0 @ Q23
|
|
asrs r4,r1,#16 @ Q7
|
|
muls r3,r4 @ L*H, Q30 signed
|
|
asrs r4,r0,#16 @ Q7
|
|
uxth r5,r1 @ Q23
|
|
muls r4,r5 @ H*L, Q30 signed
|
|
adds r3,r4 @ sum of middle partial products
|
|
uxth r4,r0
|
|
muls r4,r5 @ L*L, Q46 unsigned
|
|
lsls r5,r4,#16 @ initialise sticky bits from low half of low partial product
|
|
lsrs r4,#16 @ Q25
|
|
adds r3,r4 @ add high half of low partial product to sum of middle partial products
|
|
@ (cannot generate carry by limits on input arguments)
|
|
asrs r0,#16 @ Q7
|
|
asrs r1,#16 @ Q7
|
|
muls r0,r1 @ H*H, Q14 signed
|
|
lsls r0,#11 @ high partial product Q25
|
|
lsls r1,r3,#27 @ sticky
|
|
orrs r5,r1 @ collect further sticky bits
|
|
asrs r1,r3,#5 @ middle partial products Q25
|
|
adds r0,r1 @ final result
|
|
bx r14
|
|
|
|
.thumb_func
|
|
qfp_fcmp:
|
|
lsls r2,r0,#1
|
|
lsrs r2,#24
|
|
beq 1f
|
|
cmp r2,#0xff
|
|
bne 2f
|
|
1:
|
|
lsrs r0,#23 @ clear mantissa if NaN or denormal
|
|
lsls r0,#23
|
|
2:
|
|
lsls r2,r1,#1
|
|
lsrs r2,#24
|
|
beq 1f
|
|
cmp r2,#0xff
|
|
bne 2f
|
|
1:
|
|
lsrs r1,#23 @ clear mantissa if NaN or denormal
|
|
lsls r1,#23
|
|
2:
|
|
movs r2,#1 @ initialise result
|
|
eors r1,r0
|
|
bmi 4f @ opposite signs? then can proceed on basis of sign of x
|
|
eors r1,r0 @ restore y
|
|
bpl 1f
|
|
rsbs r2,#0 @ both negative? flip comparison
|
|
1:
|
|
cmp r0,r1
|
|
bgt 2f
|
|
blt 3f
|
|
5:
|
|
movs r2,#0
|
|
3:
|
|
rsbs r2,#0
|
|
2:
|
|
subs r0,r2,#0
|
|
bx r14
|
|
4:
|
|
orrs r1,r0
|
|
adds r1,r1
|
|
beq 5b
|
|
cmp r0,#0
|
|
bge 2b
|
|
b 3b
|
|
|
|
.ltorg
|
|
|
|
@ convert float to signed int, rounding towards -Inf, clamping
|
|
.thumb_func
|
|
qfp_float2int:
|
|
movs r1,#0 @ fall through
|
|
|
|
@ convert float in r0 to signed fixed point in r0, clamping
|
|
.thumb_func
|
|
qfp_float2fix:
|
|
push {r4,r14}
|
|
bl unpackx
|
|
movs r3,r2
|
|
adds r3,#130
|
|
bmi 6f @ -0?
|
|
add r2,r1 @ incorporate binary point position into exponent
|
|
subs r2,#23 @ r2 is now amount of left shift required
|
|
blt 1f @ requires right shift?
|
|
cmp r2,#7 @ overflow?
|
|
ble 4f
|
|
3: @ overflow
|
|
asrs r1,r0,#31 @ +ve:0 -ve:0xffffffff
|
|
mvns r1,r1 @ +ve:0xffffffff -ve:0
|
|
movs r0,#1
|
|
lsls r0,#31
|
|
5:
|
|
eors r0,r1 @ +ve:0x7fffffff -ve:0x80000000 (unsigned path: 0xffffffff)
|
|
pop {r4,r15}
|
|
1:
|
|
rsbs r2,#0 @ right shift for r0, >0
|
|
cmp r2,#32
|
|
blt 2f @ more than 32 bits of right shift?
|
|
movs r2,#32
|
|
2:
|
|
asrs r0,r0,r2
|
|
pop {r4,r15}
|
|
6:
|
|
movs r0,#0
|
|
pop {r4,r15}
|
|
|
|
@ unsigned version
|
|
.thumb_func
|
|
qfp_float2uint:
|
|
movs r1,#0 @ fall through
|
|
.thumb_func
|
|
qfp_float2ufix:
|
|
push {r4,r14}
|
|
bl unpackx
|
|
add r2,r1 @ incorporate binary point position into exponent
|
|
movs r1,r0
|
|
bmi 5b @ negative? return zero
|
|
subs r2,#23 @ r2 is now amount of left shift required
|
|
blt 1b @ requires right shift?
|
|
mvns r1,r0 @ ready to return 0xffffffff
|
|
cmp r2,#8 @ overflow?
|
|
bgt 5b
|
|
4:
|
|
lsls r0,r0,r2 @ result fits, left shifted
|
|
pop {r4,r15}
|
|
|
|
@ convert signed int to float, rounding
|
|
.thumb_func
|
|
qfp_int2float:
|
|
movs r1,#0 @ fall through
|
|
|
|
@ convert signed fix to float, rounding; number of r0 bits after point in r1
|
|
.thumb_func
|
|
qfp_fix2float:
|
|
push {r4,r5,r14}
|
|
1:
|
|
movs r2,#29
|
|
subs r2,r1 @ fix exponent
|
|
packretns: @ pack and return, sticky bits=0
|
|
movs r5,#0
|
|
packret: @ common return point: "pack and return"
|
|
bl packx
|
|
ret_pop45:
|
|
pop {r4,r5,r15}
|
|
|
|
|
|
@ unsigned version
|
|
.thumb_func
|
|
qfp_uint2float:
|
|
movs r1,#0 @ fall through
|
|
.thumb_func
|
|
qfp_ufix2float:
|
|
push {r4,r5,r14}
|
|
cmp r0,#0
|
|
bge 1b @ treat <2^31 as signed
|
|
movs r2,#30
|
|
subs r2,r1 @ fix exponent
|
|
lsls r5,r0,#31 @ one sticky bit
|
|
lsrs r0,#1
|
|
b packret
|
|
|
|
.thumb_func
|
|
qfp_fexp:
|
|
push {r4,r5,r14}
|
|
movs r1,#24
|
|
bl qfp_float2fix @ Q24: covers entire valid input range
|
|
asrs r1,r0,#16 @ Q8
|
|
ldr r2,=#5909 @ log_2(e) Q12
|
|
muls r2,r1 @ estimate exponent of result Q20 (always an underestimate)
|
|
asrs r2,#20 @ Q0
|
|
lsls r1,r0,#6 @ Q30
|
|
ldr r0,=#0x2c5c85fe @ ln(2) Q30
|
|
muls r0,r2 @ accurate contribution of estimated exponent
|
|
subs r1,r0 @ residual to be exponentiated, guaranteed ≥0, < about 0.75 Q30
|
|
|
|
@ here
|
|
@ r1: mantissa to exponentiate, 0...~0.75 Q30
|
|
@ r2: first exponent estimate
|
|
|
|
movs r5,#1 @ shift
|
|
adr r3,ftab_exp @ could use alternate words from dtab_exp to save space if required
|
|
movs r0,#1
|
|
lsls r0,#29 @ x=1 Q29
|
|
|
|
3:
|
|
ldmia r3!,{r4}
|
|
subs r4,r1,r4
|
|
bmi 1f
|
|
movs r1,r4 @ keep result of subtraction
|
|
movs r4,r0
|
|
lsrs r4,r5
|
|
adcs r0,r4 @ x+=x>>i with rounding
|
|
|
|
1:
|
|
adds r5,#1
|
|
cmp r5,#15
|
|
bne 3b
|
|
|
|
@ here
|
|
@ r0: exp a Q29 1..2+
|
|
@ r1: ε (residual x where x=a+ε), < 2^-14 Q30
|
|
@ r2: first exponent estimate
|
|
@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
|
|
|
|
lsrs r3,r0,#15 @ exp a Q14
|
|
muls r3,r1 @ ε exp a Q44
|
|
lsrs r3,#15 @ ε exp a Q29
|
|
adcs r0,r3 @ (1+ε) exp a Q29 with rounding
|
|
|
|
b packretns @ pack result
|
|
|
|
.thumb_func
|
|
qfp_fln:
|
|
push {r4,r5,r14}
|
|
asrs r1,r0,#23
|
|
bmi 3f @ -ve argument?
|
|
beq 3f @ 0 argument?
|
|
cmp r1,#0xff
|
|
beq 4f @ +Inf/NaN
|
|
bl unpackx
|
|
adds r2,#1
|
|
ldr r3,=#0x2c5c85fe @ ln(2) Q30
|
|
lsrs r1,r3,#14 @ ln(2) Q16
|
|
muls r1,r2 @ result estimate Q16
|
|
asrs r1,#16 @ integer contribution to result
|
|
muls r3,r2
|
|
lsls r4,r1,#30
|
|
subs r3,r4 @ fractional contribution to result Q30, signed
|
|
lsls r0,#8 @ Q31
|
|
|
|
@ here
|
|
@ r0: mantissa Q31
|
|
@ r1: integer contribution to result
|
|
@ r3: fractional contribution to result Q30, signed
|
|
|
|
movs r5,#1 @ shift
|
|
adr r4,ftab_exp @ could use alternate words from dtab_exp to save space if required
|
|
|
|
2:
|
|
movs r2,r0
|
|
lsrs r2,r5
|
|
adcs r2,r0 @ x+(x>>i) with rounding
|
|
bcs 1f @ >=2?
|
|
movs r0,r2 @ keep result
|
|
ldr r2,[r4]
|
|
subs r3,r2
|
|
1:
|
|
adds r4,#4
|
|
adds r5,#1
|
|
cmp r5,#15
|
|
bne 2b
|
|
|
|
@ here
|
|
@ r0: residual x, nearly 2 Q31
|
|
@ r1: integer contribution to result
|
|
@ r3: fractional part of result Q30
|
|
|
|
asrs r0,#2
|
|
adds r0,r3,r0
|
|
|
|
cmp r1,#0
|
|
bne 2f
|
|
|
|
asrs r0,#1
|
|
lsls r1,#29
|
|
adds r0,r1
|
|
movs r2,#0
|
|
b packretns
|
|
|
|
2:
|
|
lsls r1,#24
|
|
asrs r0,#6 @ Q24
|
|
adcs r0,r1 @ with rounding
|
|
movs r2,#5
|
|
b packretns
|
|
|
|
3:
|
|
ldr r0,=#0xff800000 @ -Inf
|
|
pop {r4,r5,r15}
|
|
4:
|
|
ldr r0,=#0x7f800000 @ +Inf
|
|
pop {r4,r5,r15}
|
|
|
|
.align 2
|
|
ftab_exp:
|
|
.word 0x19f323ed @ log 1+2^-1 Q30
|
|
.word 0x0e47fbe4 @ log 1+2^-2 Q30
|
|
.word 0x0789c1dc @ log 1+2^-3 Q30
|
|
.word 0x03e14618 @ log 1+2^-4 Q30
|
|
.word 0x01f829b1 @ log 1+2^-5 Q30
|
|
.word 0x00fe0546 @ log 1+2^-6 Q30
|
|
.word 0x007f80aa @ log 1+2^-7 Q30
|
|
.word 0x003fe015 @ log 1+2^-8 Q30
|
|
.word 0x001ff803 @ log 1+2^-9 Q30
|
|
.word 0x000ffe00 @ log 1+2^-10 Q30
|
|
.word 0x0007ff80 @ log 1+2^-11 Q30
|
|
.word 0x0003ffe0 @ log 1+2^-12 Q30
|
|
.word 0x0001fff8 @ log 1+2^-13 Q30
|
|
.word 0x0000fffe @ log 1+2^-14 Q30
|
|
|
|
.align 2
|
|
.thumb_func
|
|
qfp_fsub:
|
|
ldr r2,=#0x80000000
|
|
eors r1,r2 @ flip sign on second argument
|
|
@ drop into fadd, on .align2:ed boundary
|
|
|
|
.thumb_func
|
|
qfp_fadd:
|
|
push {r4,r5,r6,r14}
|
|
asrs r4,r0,#31
|
|
lsls r2,r0,#1
|
|
lsrs r2,#24 @ x exponent
|
|
beq fa_xe0
|
|
cmp r2,#255
|
|
beq fa_xe255
|
|
fa_xe:
|
|
asrs r5,r1,#31
|
|
lsls r3,r1,#1
|
|
lsrs r3,#24 @ y exponent
|
|
beq fa_ye0
|
|
cmp r3,#255
|
|
beq fa_ye255
|
|
fa_ye:
|
|
ldr r6,=#0x007fffff
|
|
ands r0,r0,r6 @ extract mantissa bits
|
|
ands r1,r1,r6
|
|
adds r6,#1 @ r6=0x00800000
|
|
orrs r0,r0,r6 @ set implied 1
|
|
orrs r1,r1,r6
|
|
eors r0,r0,r4 @ complement...
|
|
eors r1,r1,r5
|
|
subs r0,r0,r4 @ ... and add 1 if sign bit is set: 2's complement
|
|
subs r1,r1,r5
|
|
subs r5,r3,r2 @ ye-xe
|
|
subs r4,r2,r3 @ xe-ye
|
|
bmi fa_ygtx
|
|
@ here xe>=ye
|
|
cmp r4,#30
|
|
bge fa_xmgty @ xe much greater than ye?
|
|
adds r5,#32
|
|
movs r3,r2 @ save exponent
|
|
@ here y in r1 must be shifted down r4 places to align with x in r0
|
|
movs r2,r1
|
|
lsls r2,r2,r5 @ keep the bits we will shift off the bottom of r1
|
|
asrs r1,r1,r4
|
|
b fa_0
|
|
|
|
.ltorg
|
|
|
|
fa_ymgtx:
|
|
movs r2,#0 @ result is just y
|
|
movs r0,r1
|
|
b fa_1
|
|
fa_xmgty:
|
|
movs r3,r2 @ result is just x
|
|
movs r2,#0
|
|
b fa_1
|
|
|
|
fa_ygtx:
|
|
@ here ye>xe
|
|
cmp r5,#30
|
|
bge fa_ymgtx @ ye much greater than xe?
|
|
adds r4,#32
|
|
@ here x in r0 must be shifted down r5 places to align with y in r1
|
|
movs r2,r0
|
|
lsls r2,r2,r4 @ keep the bits we will shift off the bottom of r0
|
|
asrs r0,r0,r5
|
|
fa_0:
|
|
adds r0,r1 @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3
|
|
beq fa_9 @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0
|
|
fa_1:
|
|
lsrs r1,r0,#31 @ sign bit
|
|
beq fa_8
|
|
mvns r0,r0
|
|
rsbs r2,r2,#0
|
|
bne fa_8
|
|
adds r0,#1
|
|
fa_8:
|
|
adds r6,r6
|
|
@ r6=0x01000000
|
|
cmp r0,r6
|
|
bhs fa_2
|
|
fa_3:
|
|
adds r2,r2 @ normalisation loop
|
|
adcs r0,r0
|
|
subs r3,#1 @ adjust exponent
|
|
cmp r0,r6
|
|
blo fa_3
|
|
fa_2:
|
|
@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit
|
|
lsrs r0,#1
|
|
bcc fa_4
|
|
@ rounding bits here are 1:r2
|
|
adds r0,#1 @ round up
|
|
cmp r2,#0
|
|
beq fa_5 @ sticky bits all zero?
|
|
fa_4:
|
|
cmp r3,#254
|
|
bhs fa_6 @ exponent too large or negative?
|
|
lsls r1,#31 @ pack everything
|
|
add r0,r1
|
|
lsls r3,#23
|
|
add r0,r3
|
|
fa_end:
|
|
pop {r4,r5,r6,r15}
|
|
|
|
fa_9:
|
|
cmp r2,#0 @ result zero?
|
|
beq fa_end @ return +0
|
|
b fa_1
|
|
|
|
fa_5:
|
|
lsrs r0,#1
|
|
lsls r0,#1 @ round to even
|
|
b fa_4
|
|
|
|
fa_6:
|
|
bge fa_7
|
|
@ underflow
|
|
@ can handle denormals here
|
|
lsls r0,r1,#31 @ result is signed zero
|
|
pop {r4,r5,r6,r15}
|
|
fa_7:
|
|
@ overflow
|
|
lsls r0,r1,#8
|
|
adds r0,#255
|
|
lsls r0,#23 @ result is signed infinity
|
|
pop {r4,r5,r6,r15}
|
|
|
|
|
|
fa_xe0:
|
|
@ can handle denormals here
|
|
subs r2,#32
|
|
adds r2,r4 @ exponent -32 for +Inf, -33 for -Inf
|
|
b fa_xe
|
|
|
|
fa_xe255:
|
|
@ can handle NaNs here
|
|
lsls r2,#8
|
|
add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
|
|
b fa_xe
|
|
|
|
fa_ye0:
|
|
@ can handle denormals here
|
|
subs r3,#32
|
|
adds r3,r5 @ exponent -32 for +Inf, -33 for -Inf
|
|
b fa_ye
|
|
|
|
fa_ye255:
|
|
@ can handle NaNs here
|
|
lsls r3,#8
|
|
add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
|
|
b fa_ye
|
|
|
|
|
|
.align 2
|
|
.thumb_func
|
|
qfp_fmul:
|
|
push {r7,r14}
|
|
mov r2,r0
|
|
eors r2,r1 @ sign of result
|
|
lsrs r2,#31
|
|
lsls r2,#31
|
|
mov r14,r2
|
|
lsls r0,#1
|
|
lsls r1,#1
|
|
lsrs r2,r0,#24 @ xe
|
|
beq fm_xe0
|
|
cmp r2,#255
|
|
beq fm_xe255
|
|
fm_xe:
|
|
lsrs r3,r1,#24 @ ye
|
|
beq fm_ye0
|
|
cmp r3,#255
|
|
beq fm_ye255
|
|
fm_ye:
|
|
adds r7,r2,r3 @ exponent of result (will possibly be incremented)
|
|
subs r7,#128 @ adjust bias for packing
|
|
lsls r0,#8 @ x mantissa
|
|
lsls r1,#8 @ y mantissa
|
|
lsrs r0,#9
|
|
lsrs r1,#9
|
|
|
|
adds r2,r0,r1 @ for later
|
|
mov r12,r2
|
|
lsrs r2,r0,#7 @ x[22..7] Q16
|
|
lsrs r3,r1,#7 @ y[22..7] Q16
|
|
muls r2,r2,r3 @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
|
|
muls r0,r0,r1 @ result [31..0] Q46
|
|
lsrs r2,#18 @ result [45..32] Q14
|
|
bcc 1f
|
|
cmp r0,#0
|
|
bmi 1f
|
|
adds r2,#1 @ fix error in r2
|
|
1:
|
|
lsls r3,r0,#9 @ bits off bottom of result
|
|
lsrs r0,#23 @ Q23
|
|
lsls r2,#9
|
|
adds r0,r2 @ cut'n'shut
|
|
add r0,r12 @ implied 1*(x+y) to compensate for no insertion of implied 1s
|
|
@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
|
|
|
|
lsrs r1,r0,#23
|
|
bne fm_0 @ branch if we need to shift down one place
|
|
@ here 1<=result<2
|
|
cmp r7,#254
|
|
bhs fm_3a @ catches both underflow and overflow
|
|
lsls r3,#1 @ sticky bits at top of R3, rounding bit in carry
|
|
bcc fm_1 @ no rounding
|
|
beq fm_2 @ rounding tie?
|
|
adds r0,#1 @ round up
|
|
fm_1:
|
|
adds r7,#1 @ for implied 1
|
|
lsls r7,#23 @ pack result
|
|
add r0,r7
|
|
add r0,r14
|
|
pop {r7,r15}
|
|
fm_2: @ rounding tie
|
|
adds r0,#1
|
|
fm_3:
|
|
lsrs r0,#1
|
|
lsls r0,#1 @ clear bottom bit
|
|
b fm_1
|
|
|
|
@ here 1<=result-1<3
|
|
fm_0:
|
|
adds r7,#1 @ increment exponent
|
|
cmp r7,#254
|
|
bhs fm_3b @ catches both underflow and overflow
|
|
lsrs r0,#1 @ shift mantissa down
|
|
bcc fm_1a @ no rounding
|
|
adds r0,#1 @ assume we will round up
|
|
cmp r3,#0 @ sticky bits
|
|
beq fm_3c @ rounding tie?
|
|
fm_1a:
|
|
adds r7,r7
|
|
adds r7,#1 @ for implied 1
|
|
lsls r7,#22 @ pack result
|
|
add r0,r7
|
|
add r0,r14
|
|
pop {r7,r15}
|
|
|
|
fm_3c:
|
|
lsrs r0,#1
|
|
lsls r0,#1 @ clear bottom bit
|
|
b fm_1a
|
|
|
|
fm_xe0:
|
|
subs r2,#16
|
|
fm_xe255:
|
|
lsls r2,#8
|
|
b fm_xe
|
|
fm_ye0:
|
|
subs r3,#16
|
|
fm_ye255:
|
|
lsls r3,#8
|
|
b fm_ye
|
|
|
|
@ here the result is under- or overflowing
|
|
fm_3b:
|
|
bge fm_4 @ branch on overflow
|
|
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
|
|
adds r7,#1 @ exponent=-1?
|
|
bne fm_5
|
|
@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
|
|
@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
|
|
adds r0,#2
|
|
lsrs r0,#23
|
|
cmp r0,#3
|
|
bne fm_5
|
|
b fm_6
|
|
|
|
fm_3a:
|
|
bge fm_4 @ branch on overflow
|
|
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
|
|
adds r7,#1 @ exponent=-1?
|
|
bne fm_5
|
|
adds r0,#1 @ mantissa=0xffffff (i.e., r0=0x7fffff)?
|
|
lsrs r0,#23
|
|
beq fm_5
|
|
fm_6:
|
|
movs r0,#1 @ return smallest normal
|
|
lsls r0,#23
|
|
add r0,r14
|
|
pop {r7,r15}
|
|
|
|
fm_5:
|
|
mov r0,r14
|
|
pop {r7,r15}
|
|
fm_4:
|
|
movs r0,#0xff
|
|
lsls r0,#23
|
|
add r0,r14
|
|
pop {r7,r15}
|
|
|
|
@ This version of the division algorithm uses external divider hardware to estimate the
|
|
@ reciprocal of the divisor to about 14 bits; then a multiplication step to get a first
|
|
@ quotient estimate; then the remainder based on this estimate is used to calculate a
|
|
@ correction to the quotient. The result is good to about 27 bits and so we only need
|
|
@ to calculate the exact remainder when close to a rounding boundary.
|
|
.align 2
|
|
.thumb_func
|
|
qfp_fdiv:
|
|
push {r4,r5,r6,r14}
|
|
fdiv_n:
|
|
|
|
movs r4,#1
|
|
lsls r4,#23 @ implied 1 position
|
|
lsls r2,r1,#9 @ clear out sign and exponent
|
|
lsrs r2,r2,#9
|
|
orrs r2,r2,r4 @ divisor mantissa Q23 with implied 1
|
|
|
|
@ here
|
|
@ r0=packed dividend
|
|
@ r1=packed divisor
|
|
@ r2=divisor mantissa Q23
|
|
@ r4=1<<23
|
|
|
|
// see divtest.c
|
|
lsrs r3,r2,#18 @ x2=x>>18; // Q5 32..63
|
|
adr r5,rcpapp-32
|
|
ldrb r3,[r5,r3] @ u=lut5[x2-32]; // Q8
|
|
lsls r5,r2,#5
|
|
muls r5,r5,r3
|
|
asrs r5,#14 @ e=(i32)(u*(x<<5))>>14; // Q22
|
|
asrs r6,r5,#11
|
|
muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
|
|
subs r5,r6
|
|
muls r5,r5,r3 @ c=(e-e2)*u; // Q30
|
|
lsls r6,r3,#8
|
|
asrs r5,#13
|
|
adds r5,#1
|
|
asrs r5,#1
|
|
subs r5,r6,r5 @ u0=(u<<8)-((c+0x2000)>>14); // Q16
|
|
|
|
@ here
|
|
@ r0=packed dividend
|
|
@ r1=packed divisor
|
|
@ r2=divisor mantissa Q23
|
|
@ r4=1<<23
|
|
@ r5=reciprocal estimate Q16
|
|
|
|
lsrs r6,r0,#23
|
|
uxtb r3,r6 @ dividend exponent
|
|
lsls r0,#9
|
|
lsrs r0,#9
|
|
orrs r0,r0,r4 @ dividend mantissa Q23
|
|
|
|
lsrs r1,#23
|
|
eors r6,r1 @ sign of result in bit 8
|
|
lsrs r6,#8
|
|
lsls r6,#31 @ sign of result in bit 31, other bits clear
|
|
|
|
@ here
|
|
@ r0=dividend mantissa Q23
|
|
@ r1=divisor sign+exponent
|
|
@ r2=divisor mantissa Q23
|
|
@ r3=dividend exponent
|
|
@ r5=reciprocal estimate Q16
|
|
@ r6b31=sign of result
|
|
|
|
uxtb r1,r1 @ divisor exponent
|
|
cmp r1,#0
|
|
beq retinf
|
|
cmp r1,#255
|
|
beq 20f @ divisor is infinite
|
|
cmp r3,#0
|
|
beq retzero
|
|
cmp r3,#255
|
|
beq retinf
|
|
subs r3,r1 @ initial result exponent (no bias)
|
|
adds r3,#125 @ add bias
|
|
|
|
lsrs r1,r0,#8 @ dividend mantissa Q15
|
|
|
|
@ here
|
|
@ r0=dividend mantissa Q23
|
|
@ r1=dividend mantissa Q15
|
|
@ r2=divisor mantissa Q23
|
|
@ r3=initial result exponent
|
|
@ r5=reciprocal estimate Q16
|
|
@ r6b31=sign of result
|
|
|
|
muls r1,r5
|
|
|
|
lsrs r1,#16 @ Q15 qu0=(q15)(u*y0);
|
|
lsls r0,r0,#15 @ dividend Q38
|
|
movs r4,r2
|
|
muls r4,r1 @ Q38 qu0*x
|
|
subs r4,r0,r4 @ Q38 re0=(y<<15)-qu0*x; note this remainder is signed
|
|
asrs r4,#10
|
|
muls r4,r5 @ Q44 qu1=(re0>>10)*u; this quotient correction is also signed
|
|
asrs r4,#16 @ Q28
|
|
lsls r1,#13
|
|
adds r1,r1,r4 @ Q28 qu=(qu0<<13)+(qu1>>16);
|
|
|
|
@ here
|
|
@ r0=dividend mantissa Q38
|
|
@ r1=quotient Q28
|
|
@ r2=divisor mantissa Q23
|
|
@ r3=initial result exponent
|
|
@ r6b31=sign of result
|
|
|
|
lsrs r4,r1,#28
|
|
bne 1f
|
|
@ here the quotient is less than 1<<28 (i.e., result mantissa <1.0)
|
|
|
|
adds r1,#5
|
|
lsrs r4,r1,#4 @ rounding + small reduction in systematic bias
|
|
bcc 2f @ skip if we are not near a rounding boundary
|
|
lsrs r1,#3 @ quotient Q25
|
|
lsls r0,#10 @ dividend mantissa Q48
|
|
muls r1,r1,r2 @ quotient*divisor Q48
|
|
subs r0,r0,r1 @ remainder Q48
|
|
bmi 2f
|
|
b 3f
|
|
|
|
1:
|
|
@ here the quotient is at least 1<<28 (i.e., result mantissa >=1.0)
|
|
|
|
adds r3,#1 @ bump exponent (and shift mantissa down one more place)
|
|
adds r1,#9
|
|
lsrs r4,r1,#5 @ rounding + small reduction in systematic bias
|
|
bcc 2f @ skip if we are not near a rounding boundary
|
|
|
|
lsrs r1,#4 @ quotient Q24
|
|
lsls r0,#9 @ dividend mantissa Q47
|
|
muls r1,r1,r2 @ quotient*divisor Q47
|
|
subs r0,r0,r1 @ remainder Q47
|
|
bmi 2f
|
|
3:
|
|
adds r4,#1 @ increment quotient as we are above the rounding boundary
|
|
|
|
@ here
|
|
@ r3=result exponent
|
|
@ r4=correctly rounded quotient Q23 in range [1,2] *note closed interval*
|
|
@ r6b31=sign of result
|
|
|
|
2:
|
|
cmp r3,#254
|
|
bhs 10f @ this catches both underflow and overflow
|
|
lsls r1,r3,#23
|
|
adds r0,r4,r1
|
|
adds r0,r6
|
|
pop {r4,r5,r6,r15}
|
|
|
|
@ here divisor is infinite; dividend exponent in r3
|
|
20:
|
|
cmp r3,#255
|
|
bne retzero
|
|
|
|
retinf:
|
|
movs r0,#255
|
|
21:
|
|
lsls r0,#23
|
|
orrs r0,r6
|
|
pop {r4,r5,r6,r15}
|
|
|
|
10:
|
|
bge retinf @ overflow?
|
|
adds r1,r3,#1
|
|
bne retzero @ exponent <-1? return 0
|
|
@ here exponent is exactly -1
|
|
lsrs r1,r4,#25
|
|
bcc retzero @ mantissa is not 01000000?
|
|
@ return minimum normal
|
|
movs r0,#1
|
|
lsls r0,#23
|
|
orrs r0,r6
|
|
pop {r4,r5,r6,r15}
|
|
|
|
retzero:
|
|
movs r0,r6
|
|
pop {r4,r5,r6,r15}
|
|
|
|
@ x2=[32:1:63]/32;
|
|
@ round(256 ./(x2+1/64))
|
|
.align 2
|
|
rcpapp:
|
|
.byte 252,245,237,231,224,218,213,207,202,197,193,188,184,180,176,172
|
|
.byte 169,165,162,159,156,153,150,148,145,142,140,138,135,133,131,129
|
|
|
|
@ The square root routine uses an initial approximation to the reciprocal of the square root of the argument based
|
|
@ on the top four bits of the mantissa (possibly shifted one place to make the exponent even). It then performs two
|
|
@ Newton-Raphson iterations, resulting in about 14 bits of accuracy. This reciprocal is then multiplied by
|
|
@ the original argument to produce an approximation to the result, again with about 14 bits of accuracy.
|
|
@ Then a remainder is calculated, and multiplied by the reciprocal estiamte to generate a correction term
|
|
@ giving a final answer to about 28 bits of accuracy. A final remainder calculation rounds to the correct
|
|
@ result if necessary.
|
|
@ Again, the fixed-point calculation is carefully implemented to preserve accuracy, and similar comments to those
|
|
@ made above on the fast division routine apply.
|
|
@ The reciprocal square root calculation has been tested for all possible (possibly shifted) input mantissa values.
|
|
.align 2
|
|
.thumb_func
|
|
qfp_fsqrt:
|
|
push {r4}
|
|
lsls r1,r0,#1
|
|
bcs sq_0 @ negative?
|
|
lsls r1,#8
|
|
lsrs r1,#9 @ mantissa
|
|
movs r2,#1
|
|
lsls r2,#23
|
|
adds r1,r2 @ insert implied 1
|
|
lsrs r2,r0,#23 @ extract exponent
|
|
beq sq_2 @ zero?
|
|
cmp r2,#255 @ infinite?
|
|
beq sq_1
|
|
adds r2,#125 @ correction for packing
|
|
asrs r2,#1 @ exponent/2, LSB into carry
|
|
bcc 1f
|
|
lsls r1,#1 @ was even: double mantissa; mantissa y now 1..4 Q23
|
|
1:
|
|
adr r4,rsqrtapp-4@ first four table entries are never accessed because of the mantissa's leading 1
|
|
lsrs r3,r1,#21 @ y Q2
|
|
ldrb r4,[r4,r3] @ initial approximation to reciprocal square root a0 Q8
|
|
|
|
lsrs r0,r1,#7 @ y Q16: first Newton-Raphson iteration
|
|
muls r0,r4 @ a0*y Q24
|
|
muls r0,r4 @ r0=p0=a0*y*y Q32
|
|
asrs r0,#12 @ r0 Q20
|
|
muls r0,r4 @ dy0=a0*r0 Q28
|
|
asrs r0,#13 @ dy0 Q15
|
|
lsls r4,#8 @ a0 Q16
|
|
subs r4,r0 @ a1=a0-dy0/2 Q16-Q15/2 -> Q16
|
|
adds r4,#170 @ mostly remove systematic error in this approximation: gains approximately 1 bit
|
|
|
|
movs r0,r4 @ second Newton-Raphson iteration
|
|
muls r0,r0 @ a1*a1 Q32
|
|
lsrs r0,#15 @ a1*a1 Q17
|
|
lsrs r3,r1,#8 @ y Q15
|
|
muls r0,r3 @ r1=p1=a1*a1*y Q32
|
|
asrs r0,#12 @ r1 Q20
|
|
muls r0,r4 @ dy1=a1*r1 Q36
|
|
asrs r0,#21 @ dy1 Q15
|
|
subs r4,r0 @ a2=a1-dy1/2 Q16-Q15/2 -> Q16
|
|
|
|
muls r3,r4 @ a3=y*a2 Q31
|
|
lsrs r3,#15 @ a3 Q16
|
|
@ here a2 is an approximation to the reciprocal square root
|
|
@ and a3 is an approximation to the square root
|
|
movs r0,r3
|
|
muls r0,r0 @ a3*a3 Q32
|
|
lsls r1,#9 @ y Q32
|
|
subs r0,r1,r0 @ r2=y-a3*a3 Q32 remainder
|
|
asrs r0,#5 @ r2 Q27
|
|
muls r4,r0 @ r2*a2 Q43
|
|
lsls r3,#7 @ a3 Q23
|
|
asrs r0,r4,#15 @ r2*a2 Q28
|
|
adds r0,#16 @ rounding to Q24
|
|
asrs r0,r0,#6 @ r2*a2 Q22
|
|
add r3,r0 @ a4 Q23: candidate final result
|
|
bcc sq_3 @ near rounding boundary? skip if no rounding needed
|
|
mov r4,r3
|
|
adcs r4,r4 @ a4+0.5ulp Q24
|
|
muls r4,r4 @ Q48
|
|
lsls r1,#16 @ y Q48
|
|
subs r1,r4 @ remainder Q48
|
|
bmi sq_3
|
|
adds r3,#1 @ round up
|
|
sq_3:
|
|
lsls r2,#23 @ pack exponent
|
|
adds r0,r2,r3
|
|
sq_6:
|
|
pop {r4}
|
|
bx r14
|
|
|
|
sq_0:
|
|
lsrs r1,#24
|
|
beq sq_2 @ -0: return it
|
|
@ here negative and not -0: return -Inf
|
|
asrs r0,#31
|
|
sq_5:
|
|
lsls r0,#23
|
|
b sq_6
|
|
sq_1: @ +Inf
|
|
lsrs r0,#23
|
|
b sq_5
|
|
sq_2:
|
|
lsrs r0,#31
|
|
lsls r0,#31
|
|
b sq_6
|
|
|
|
@ round(sqrt(2^22./[72:16:248]))
|
|
rsqrtapp:
|
|
.byte 0xf1,0xda,0xc9,0xbb, 0xb0,0xa6,0x9e,0x97, 0x91,0x8b,0x86,0x82
|
|
|
|
qfp_lib_end:
|