Drivers/qfplib-m0-full-20240105/qfplib-m0-full.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041

@ Copyright 2019-2024 Mark Owen
@ http://www.quinapalus.com
@ E-mail: qfp@quinapalus.com
@
@ This file is free software: you can redistribute it and/or modify
@ it under the terms of version 2 of the GNU General Public License
@ as published by the Free Software Foundation.
@
@ This file is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this file.  If not, see <http://www.gnu.org/licenses/> or
@ write to the Free Software Foundation, Inc., 51 Franklin Street,
@ Fifth Floor, Boston, MA  02110-1301, USA.

.section .RamFunc

.syntax unified
.cpu cortex-m0plus
.thumb

@ exported symbols

.global qfp_fadd
.global qfp_fsub
.global qfp_fmul
.global qfp_fdiv
.global qfp_fcmp
.global qfp_fsqrt
.global qfp_float2int
.global qfp_float2fix
.global qfp_float2uint
.global qfp_float2ufix
.global qfp_int2float
.global qfp_fix2float
.global qfp_uint2float
.global qfp_ufix2float
.global qfp_fexp
.global qfp_fln

qfp_lib_start:

@ exchange r0<->r1, r2<->r3
xchxy:
 push {r0,r2,r14}
 mov r0,r1
 mov r2,r3
 pop {r1,r3,r15}

@ IEEE single in r0-> signed (two's complemennt) mantissa in r0 9Q23 (24 significant bits), signed exponent (bias removed) in r2
@ trashes r4; zero, denormal -> mantissa=+/-1, exponent=-380; Inf, NaN -> mantissa=+/-1, exponent=+640
unpackx:
 lsrs r2,r0,#23 @ save exponent and sign
 lsls r0,#9     @ extract mantissa
 lsrs r0,#9
 movs r4,#1
 lsls r4,#23
 orrs r0,r4     @ reinstate implied leading 1
 cmp r2,#255    @ test sign bit
 uxtb r2,r2     @ clear it
 bls 1f         @ branch on positive
 rsbs r0,#0     @ negate mantissa
1:
 subs r2,#1
 cmp r2,#254    @ zero/denormal/Inf/NaN?
 bhs 2f
 subs r2,#126   @ remove exponent bias: can now be -126..+127
 bx r14

2:              @ here with special-case values
 cmp r0,#0
 mov r0,r4      @ set mantissa to +1
 bpl 3f
 rsbs r0,#0     @ zero/denormal/Inf/NaN: mantissa=+/-1
3:
 subs r2,#126   @ zero/denormal: exponent -> -127; Inf, NaN: exponent -> 128
 lsls r2,#2     @ zero/denormal: exponent -> -508; Inf, NaN: exponent -> 512
 adds r2,#128   @ zero/denormal: exponent -> -380; Inf, NaN: exponent -> 640
 bx r14

@ normalise and pack signed mantissa in r0 nominally 3Q29, signed exponent in r2-> IEEE single in r0
@ trashes r4, preserves r1,r3
@ r5: "sticky bits", must be zero iff all result bits below r0 are zero for correct rounding
packx:
 lsrs r4,r0,#31 @ save sign bit
 lsls r4,r4,#31 @ sign now in b31
 bpl 2f         @ skip if positive
 cmp r5,#0
 beq 11f
 adds r0,#1     @ fiddle carry in to following rsb if sticky bits are non-zero
11:
 rsbs r0,#0     @ can now treat r0 as unsigned
packx0:
 bmi 3f         @ catch r0=0x80000000 case
2:
 subs r2,#1     @ normalisation loop
 adds r0,r0
 beq 1f         @ zero? special case
 bpl 2b         @ normalise so leading "1" in bit 31
3:
 adds r2,#129   @ (mis-)offset exponent
 bne 12f        @ special case: highest denormal can round to lowest normal
 adds r0,#0x80  @ in special case, need to add 256 to r0 for rounding
 bcs 4f         @ tripped carry? then have leading 1 in C as required
12:
 adds r0,#0x80  @ rounding
 bcs 4f         @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
 cmp r5,#0
 beq 7f         @ sticky bits zero?
8:
 lsls r0,#1     @ remove leading 1
9:
 subs r2,#1     @ compensate exponent on this path
4:
 cmp r2,#254
 bge 5f         @ overflow?
 adds r2,#1     @ correct exponent offset
 ble 10f        @ denormal/underflow?
 lsrs r0,#9     @ align mantissa
 lsls r2,#23    @ align exponent
 orrs r0,r2     @ assemble exponent and mantissa
6:
 orrs r0,r4     @ apply sign
1:
 bx r14

5:
 movs r0,#0xff  @ create infinity
 lsls r0,#23
 b 6b

10:
 movs r0,#0     @ create zero
 bx r14

7:              @ sticky bit rounding case
 lsls r5,r0,#24 @ check bottom 8 bits of r0
 bne 8b         @ in rounding-tie case?
 lsrs r0,#9     @ ensure even result
 lsls r0,#10
 b 9b

.align 2
.ltorg

@ signed multiply r0 1Q23 by r1 4Q23, result in r0 7Q25, sticky bits in r5
@ trashes r3,r4
mul0:
 uxth r3,r0      @ Q23
 asrs r4,r1,#16  @ Q7
 muls r3,r4      @ L*H, Q30 signed
 asrs r4,r0,#16  @ Q7
 uxth r5,r1      @ Q23
 muls r4,r5      @ H*L, Q30 signed
 adds r3,r4      @ sum of middle partial products
 uxth r4,r0
 muls r4,r5      @ L*L, Q46 unsigned
 lsls r5,r4,#16  @ initialise sticky bits from low half of low partial product
 lsrs r4,#16     @ Q25
 adds r3,r4      @ add high half of low partial product to sum of middle partial products
                 @ (cannot generate carry by limits on input arguments)
 asrs r0,#16     @ Q7
 asrs r1,#16     @ Q7
 muls r0,r1      @ H*H, Q14 signed
 lsls r0,#11     @ high partial product Q25
 lsls r1,r3,#27  @ sticky
 orrs r5,r1      @ collect further sticky bits
 asrs r1,r3,#5   @ middle partial products Q25
 adds r0,r1      @ final result
 bx r14

.thumb_func
qfp_fcmp:
 lsls r2,r0,#1
 lsrs r2,#24
 beq 1f
 cmp r2,#0xff
 bne 2f
1:
 lsrs r0,#23     @ clear mantissa if NaN or denormal
 lsls r0,#23
2:
 lsls r2,r1,#1
 lsrs r2,#24
 beq 1f
 cmp r2,#0xff
 bne 2f
1:
 lsrs r1,#23     @ clear mantissa if NaN or denormal
 lsls r1,#23
2:
 movs r2,#1      @ initialise result
 eors r1,r0
 bmi 4f          @ opposite signs? then can proceed on basis of sign of x
 eors r1,r0      @ restore y
 bpl 1f
 rsbs r2,#0      @ both negative? flip comparison
1:
 cmp r0,r1
 bgt 2f
 blt 3f
5:
 movs r2,#0
3:
 rsbs r2,#0
2:
 subs r0,r2,#0
 bx r14
4:
 orrs r1,r0
 adds r1,r1
 beq 5b
 cmp r0,#0
 bge 2b
 b 3b

.ltorg

@ convert float to signed int, rounding towards -Inf, clamping
.thumb_func
qfp_float2int:
 movs r1,#0      @ fall through

@ convert float in r0 to signed fixed point in r0, clamping
.thumb_func
qfp_float2fix:
 push {r4,r14}
 bl unpackx
 movs r3,r2
 adds r3,#130
 bmi 6f          @ -0?
 add r2,r1       @ incorporate binary point position into exponent
 subs r2,#23     @ r2 is now amount of left shift required
 blt 1f          @ requires right shift?
 cmp r2,#7       @ overflow?
 ble 4f
3:               @ overflow
 asrs r1,r0,#31  @ +ve:0 -ve:0xffffffff
 mvns r1,r1      @ +ve:0xffffffff -ve:0
 movs r0,#1
 lsls r0,#31
5:
 eors r0,r1      @ +ve:0x7fffffff -ve:0x80000000 (unsigned path: 0xffffffff)
 pop {r4,r15}
1:
 rsbs r2,#0      @ right shift for r0, >0
 cmp r2,#32
 blt 2f          @ more than 32 bits of right shift?
 movs r2,#32
2:
 asrs r0,r0,r2
 pop {r4,r15}
6:
 movs r0,#0
 pop {r4,r15}

@ unsigned version
.thumb_func
qfp_float2uint:
 movs r1,#0      @ fall through
.thumb_func
qfp_float2ufix:
 push {r4,r14}
 bl unpackx
 add r2,r1       @ incorporate binary point position into exponent
 movs r1,r0
 bmi 5b          @ negative? return zero
 subs r2,#23     @ r2 is now amount of left shift required
 blt 1b          @ requires right shift?
 mvns r1,r0      @ ready to return 0xffffffff
 cmp r2,#8       @ overflow?
 bgt 5b
4:
 lsls r0,r0,r2   @ result fits, left shifted
 pop {r4,r15}

@ convert signed int to float, rounding
.thumb_func
qfp_int2float:
 movs r1,#0      @ fall through

@ convert signed fix to float, rounding; number of r0 bits after point in r1
.thumb_func
qfp_fix2float:
 push {r4,r5,r14}
1:
 movs r2,#29
 subs r2,r1      @ fix exponent
packretns:       @ pack and return, sticky bits=0
 movs r5,#0
packret:         @ common return point: "pack and return"
 bl packx
ret_pop45:
 pop {r4,r5,r15}


@ unsigned version
.thumb_func
qfp_uint2float:
 movs r1,#0      @ fall through
.thumb_func
qfp_ufix2float:
 push {r4,r5,r14}
 cmp r0,#0
 bge 1b          @ treat <2^31 as signed
 movs r2,#30
 subs r2,r1      @ fix exponent
 lsls r5,r0,#31  @ one sticky bit
 lsrs r0,#1
 b packret

.thumb_func
qfp_fexp:
 push {r4,r5,r14}
 movs r1,#24
 bl qfp_float2fix    @ Q24: covers entire valid input range
 asrs r1,r0,#16      @ Q8
 ldr r2,=#5909       @ log_2(e) Q12
 muls r2,r1          @ estimate exponent of result Q20 (always an underestimate)
 asrs r2,#20         @ Q0
 lsls r1,r0,#6       @ Q30
 ldr r0,=#0x2c5c85fe @ ln(2) Q30
 muls r0,r2          @ accurate contribution of estimated exponent
 subs r1,r0          @ residual to be exponentiated, guaranteed ≥0, < about 0.75 Q30

@ here
@ r1: mantissa to exponentiate, 0...~0.75 Q30
@ r2: first exponent estimate

 movs r5,#1          @ shift
 adr r3,ftab_exp     @ could use alternate words from dtab_exp to save space if required
 movs r0,#1
 lsls r0,#29         @ x=1 Q29

3:
 ldmia r3!,{r4}
 subs r4,r1,r4
 bmi 1f
 movs r1,r4          @ keep result of subtraction
 movs r4,r0
 lsrs r4,r5
 adcs r0,r4          @ x+=x>>i with rounding

1:
 adds r5,#1
 cmp r5,#15
 bne 3b

@ here
@ r0: exp a Q29 1..2+
@ r1: ε (residual x where x=a+ε), < 2^-14 Q30
@ r2: first exponent estimate
@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)

 lsrs r3,r0,#15      @ exp a Q14
 muls r3,r1          @ ε exp a Q44
 lsrs r3,#15         @ ε exp a Q29
 adcs r0,r3          @ (1+ε) exp a Q29 with rounding

 b packretns         @ pack result

.thumb_func
qfp_fln:
 push {r4,r5,r14}
 asrs r1,r0,#23
 bmi 3f              @ -ve argument?
 beq 3f              @ 0 argument?
 cmp r1,#0xff
 beq 4f              @ +Inf/NaN
 bl unpackx
 adds r2,#1
 ldr r3,=#0x2c5c85fe @ ln(2) Q30
 lsrs r1,r3,#14      @ ln(2) Q16
 muls r1,r2          @ result estimate Q16
 asrs r1,#16         @ integer contribution to result
 muls r3,r2
 lsls r4,r1,#30
 subs r3,r4          @ fractional contribution to result Q30, signed
 lsls r0,#8          @ Q31

@ here
@ r0: mantissa Q31
@ r1: integer contribution to result
@ r3: fractional contribution to result Q30, signed

 movs r5,#1          @ shift
 adr r4,ftab_exp     @ could use alternate words from dtab_exp to save space if required

2:
 movs r2,r0
 lsrs r2,r5
 adcs r2,r0          @ x+(x>>i) with rounding
 bcs 1f              @ >=2?
 movs r0,r2          @ keep result
 ldr r2,[r4]
 subs r3,r2
1:
 adds r4,#4
 adds r5,#1
 cmp r5,#15
 bne 2b

@ here
@ r0: residual x, nearly 2 Q31
@ r1: integer contribution to result
@ r3: fractional part of result Q30

 asrs r0,#2
 adds r0,r3,r0

 cmp r1,#0
 bne 2f

 asrs r0,#1
 lsls r1,#29
 adds r0,r1
 movs r2,#0
 b packretns

2:
 lsls r1,#24
 asrs r0,#6          @ Q24
 adcs r0,r1          @ with rounding
 movs r2,#5
 b packretns

3:
 ldr r0,=#0xff800000 @ -Inf
 pop {r4,r5,r15}
4:
 ldr r0,=#0x7f800000 @ +Inf
 pop {r4,r5,r15}

.align 2
ftab_exp:
.word 0x19f323ed   @ log 1+2^-1 Q30
.word 0x0e47fbe4   @ log 1+2^-2 Q30
.word 0x0789c1dc   @ log 1+2^-3 Q30
.word 0x03e14618   @ log 1+2^-4 Q30
.word 0x01f829b1   @ log 1+2^-5 Q30
.word 0x00fe0546   @ log 1+2^-6 Q30
.word 0x007f80aa   @ log 1+2^-7 Q30
.word 0x003fe015   @ log 1+2^-8 Q30
.word 0x001ff803   @ log 1+2^-9 Q30
.word 0x000ffe00   @ log 1+2^-10 Q30
.word 0x0007ff80   @ log 1+2^-11 Q30
.word 0x0003ffe0   @ log 1+2^-12 Q30
.word 0x0001fff8   @ log 1+2^-13 Q30
.word 0x0000fffe   @ log 1+2^-14 Q30

.align 2
.thumb_func
qfp_fsub:
 ldr r2,=#0x80000000
 eors r1,r2    @ flip sign on second argument
@ drop into fadd, on .align2:ed boundary

.thumb_func
qfp_fadd:
 push {r4,r5,r6,r14}
 asrs r4,r0,#31
 lsls r2,r0,#1
 lsrs r2,#24     @ x exponent
 beq fa_xe0
 cmp r2,#255
 beq fa_xe255
fa_xe:
 asrs r5,r1,#31
 lsls r3,r1,#1
 lsrs r3,#24     @ y exponent
 beq fa_ye0
 cmp r3,#255
 beq fa_ye255
fa_ye:
 ldr r6,=#0x007fffff
 ands r0,r0,r6   @ extract mantissa bits
 ands r1,r1,r6
 adds r6,#1      @ r6=0x00800000
 orrs r0,r0,r6   @ set implied 1
 orrs r1,r1,r6
 eors r0,r0,r4   @ complement...
 eors r1,r1,r5
 subs r0,r0,r4   @ ... and add 1 if sign bit is set: 2's complement
 subs r1,r1,r5
 subs r5,r3,r2   @ ye-xe
 subs r4,r2,r3   @ xe-ye
 bmi fa_ygtx
@ here xe>=ye
 cmp r4,#30
 bge fa_xmgty    @ xe much greater than ye?
 adds r5,#32
 movs r3,r2      @ save exponent
@ here y in r1 must be shifted down r4 places to align with x in r0
 movs r2,r1
 lsls r2,r2,r5   @ keep the bits we will shift off the bottom of r1
 asrs r1,r1,r4
 b fa_0

.ltorg
 
fa_ymgtx:
 movs r2,#0      @ result is just y
 movs r0,r1
 b fa_1
fa_xmgty:
 movs r3,r2      @ result is just x
 movs r2,#0
 b fa_1

fa_ygtx:
@ here ye>xe
 cmp r5,#30
 bge fa_ymgtx    @ ye much greater than xe?
 adds r4,#32
@ here x in r0 must be shifted down r5 places to align with y in r1
 movs r2,r0
 lsls r2,r2,r4   @ keep the bits we will shift off the bottom of r0
 asrs r0,r0,r5
fa_0:
 adds r0,r1      @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3
 beq fa_9        @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0
fa_1: 
 lsrs r1,r0,#31  @ sign bit
 beq fa_8
 mvns r0,r0
 rsbs r2,r2,#0
 bne fa_8
 adds r0,#1
fa_8:
 adds r6,r6
@ r6=0x01000000
 cmp r0,r6
 bhs fa_2
fa_3:
 adds r2,r2      @ normalisation loop
 adcs r0,r0
 subs r3,#1      @ adjust exponent
 cmp r0,r6
 blo fa_3
fa_2:
@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit
 lsrs r0,#1
 bcc fa_4
@ rounding bits here are 1:r2
 adds r0,#1      @ round up
 cmp r2,#0
 beq fa_5        @ sticky bits all zero?
fa_4:
 cmp r3,#254
 bhs fa_6        @ exponent too large or negative?
 lsls r1,#31     @ pack everything
 add r0,r1
 lsls r3,#23
 add r0,r3
fa_end:
 pop {r4,r5,r6,r15}

fa_9:
 cmp r2,#0       @ result zero?
 beq fa_end      @ return +0
 b fa_1

fa_5:
 lsrs r0,#1
 lsls r0,#1      @ round to even
 b fa_4

fa_6:
 bge fa_7
@ underflow
@ can handle denormals here
 lsls r0,r1,#31  @ result is signed zero
 pop {r4,r5,r6,r15}
fa_7:
@ overflow
 lsls r0,r1,#8
 adds r0,#255
 lsls r0,#23     @ result is signed infinity
 pop {r4,r5,r6,r15}


fa_xe0:
@ can handle denormals here
 subs r2,#32
 adds r2,r4       @ exponent -32 for +Inf, -33 for -Inf
 b fa_xe

fa_xe255:
@ can handle NaNs here
 lsls r2,#8
 add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
 b fa_xe

fa_ye0:
@ can handle denormals here
 subs r3,#32
 adds r3,r5       @ exponent -32 for +Inf, -33 for -Inf
 b fa_ye

fa_ye255:
@ can handle NaNs here
 lsls r3,#8
 add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
 b fa_ye


.align 2
.thumb_func
qfp_fmul:
 push {r7,r14}
 mov r2,r0
 eors r2,r1       @ sign of result
 lsrs r2,#31
 lsls r2,#31
 mov r14,r2
 lsls r0,#1
 lsls r1,#1
 lsrs r2,r0,#24   @ xe
 beq fm_xe0
 cmp r2,#255
 beq fm_xe255
fm_xe:
 lsrs r3,r1,#24   @ ye
 beq fm_ye0
 cmp r3,#255
 beq fm_ye255
fm_ye:
 adds r7,r2,r3    @ exponent of result (will possibly be incremented)
 subs r7,#128     @ adjust bias for packing
 lsls r0,#8       @ x mantissa
 lsls r1,#8       @ y mantissa
 lsrs r0,#9
 lsrs r1,#9

 adds r2,r0,r1    @ for later
 mov r12,r2
 lsrs r2,r0,#7    @ x[22..7] Q16
 lsrs r3,r1,#7    @ y[22..7] Q16
 muls r2,r2,r3    @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
 muls r0,r0,r1    @ result [31..0] Q46
 lsrs r2,#18      @ result [45..32] Q14
 bcc 1f
 cmp r0,#0
 bmi 1f
 adds r2,#1       @ fix error in r2
1:
 lsls r3,r0,#9    @ bits off bottom of result
 lsrs r0,#23      @ Q23
 lsls r2,#9
 adds r0,r2       @ cut'n'shut
 add r0,r12       @ implied 1*(x+y) to compensate for no insertion of implied 1s
@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)

 lsrs r1,r0,#23
 bne fm_0         @ branch if we need to shift down one place
@ here 1<=result<2
 cmp r7,#254
 bhs fm_3a        @ catches both underflow and overflow
 lsls r3,#1       @ sticky bits at top of R3, rounding bit in carry
 bcc fm_1         @ no rounding
 beq fm_2         @ rounding tie?
 adds r0,#1       @ round up
fm_1:
 adds r7,#1       @ for implied 1
 lsls r7,#23      @ pack result
 add r0,r7
 add r0,r14
 pop {r7,r15}
fm_2:             @ rounding tie
 adds r0,#1
fm_3:
 lsrs r0,#1
 lsls r0,#1       @ clear bottom bit
 b fm_1

@ here 1<=result-1<3
fm_0:
 adds r7,#1       @ increment exponent
 cmp r7,#254
 bhs fm_3b        @ catches both underflow and overflow
 lsrs r0,#1       @ shift mantissa down
 bcc fm_1a        @ no rounding
 adds r0,#1       @ assume we will round up
 cmp r3,#0        @ sticky bits
 beq fm_3c        @ rounding tie?
fm_1a:
 adds r7,r7
 adds r7,#1       @ for implied 1
 lsls r7,#22      @ pack result
 add r0,r7
 add r0,r14
 pop {r7,r15}

fm_3c:
 lsrs r0,#1
 lsls r0,#1       @ clear bottom bit
 b fm_1a

fm_xe0:
 subs r2,#16
fm_xe255:
 lsls r2,#8
 b fm_xe
fm_ye0:
 subs r3,#16
fm_ye255:
 lsls r3,#8
 b fm_ye

@ here the result is under- or overflowing
fm_3b:
 bge fm_4        @ branch on overflow
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
 adds r7,#1      @ exponent=-1?
 bne fm_5
@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
 adds r0,#2
 lsrs r0,#23
 cmp r0,#3
 bne fm_5
 b fm_6

fm_3a:
 bge fm_4        @ branch on overflow
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
 adds r7,#1      @ exponent=-1?
 bne fm_5
 adds r0,#1      @ mantissa=0xffffff (i.e., r0=0x7fffff)?
 lsrs r0,#23
 beq fm_5
fm_6:
 movs r0,#1      @ return smallest normal
 lsls r0,#23
 add r0,r14
 pop {r7,r15}

fm_5:
 mov r0,r14
 pop {r7,r15}
fm_4:
 movs r0,#0xff
 lsls r0,#23
 add r0,r14
 pop {r7,r15}

@ This version of the division algorithm uses external divider hardware to estimate the
@ reciprocal of the divisor to about 14 bits; then a multiplication step to get a first
@ quotient estimate; then the remainder based on this estimate is used to calculate a
@ correction to the quotient. The result is good to about 27 bits and so we only need
@ to calculate the exact remainder when close to a rounding boundary.
.align 2
.thumb_func
qfp_fdiv:
 push {r4,r5,r6,r14}
fdiv_n:

 movs r4,#1
 lsls r4,#23   @ implied 1 position
 lsls r2,r1,#9 @ clear out sign and exponent
 lsrs r2,r2,#9
 orrs r2,r2,r4 @ divisor mantissa Q23 with implied 1

@ here
@ r0=packed dividend
@ r1=packed divisor
@ r2=divisor mantissa Q23
@ r4=1<<23

// see divtest.c
 lsrs r3,r2,#18 @ x2=x>>18; // Q5 32..63
 adr r5,rcpapp-32
 ldrb r3,[r5,r3] @ u=lut5[x2-32]; // Q8
 lsls r5,r2,#5
 muls r5,r5,r3
 asrs r5,#14 @ e=(i32)(u*(x<<5))>>14; // Q22
 asrs r6,r5,#11
 muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
 subs r5,r6
 muls r5,r5,r3 @ c=(e-e2)*u; // Q30
 lsls r6,r3,#8
 asrs r5,#13
 adds r5,#1
 asrs r5,#1
 subs r5,r6,r5 @ u0=(u<<8)-((c+0x2000)>>14); // Q16

@ here
@ r0=packed dividend
@ r1=packed divisor
@ r2=divisor mantissa Q23
@ r4=1<<23
@ r5=reciprocal estimate Q16

 lsrs r6,r0,#23
 uxtb r3,r6        @ dividend exponent
 lsls r0,#9
 lsrs r0,#9
 orrs r0,r0,r4     @ dividend mantissa Q23

 lsrs r1,#23
 eors r6,r1        @ sign of result in bit 8
 lsrs r6,#8
 lsls r6,#31       @ sign of result in bit 31, other bits clear

@ here
@ r0=dividend mantissa Q23
@ r1=divisor sign+exponent
@ r2=divisor mantissa Q23
@ r3=dividend exponent
@ r5=reciprocal estimate Q16
@ r6b31=sign of result

 uxtb r1,r1        @ divisor exponent
 cmp r1,#0
 beq retinf
 cmp r1,#255
 beq 20f           @ divisor is infinite
 cmp r3,#0
 beq retzero
 cmp r3,#255
 beq retinf
 subs r3,r1        @ initial result exponent (no bias)
 adds r3,#125      @ add bias

 lsrs r1,r0,#8     @ dividend mantissa Q15

@ here
@ r0=dividend mantissa Q23
@ r1=dividend mantissa Q15
@ r2=divisor mantissa Q23
@ r3=initial result exponent
@ r5=reciprocal estimate Q16
@ r6b31=sign of result

 muls r1,r5

 lsrs r1,#16       @ Q15 qu0=(q15)(u*y0);
 lsls r0,r0,#15    @ dividend Q38
 movs r4,r2
 muls r4,r1        @ Q38 qu0*x
 subs r4,r0,r4     @ Q38 re0=(y<<15)-qu0*x; note this remainder is signed
 asrs r4,#10
 muls r4,r5        @ Q44 qu1=(re0>>10)*u; this quotient correction is also signed
 asrs r4,#16       @ Q28
 lsls r1,#13
 adds r1,r1,r4     @ Q28 qu=(qu0<<13)+(qu1>>16);

@ here
@ r0=dividend mantissa Q38
@ r1=quotient Q28
@ r2=divisor mantissa Q23
@ r3=initial result exponent
@ r6b31=sign of result

 lsrs r4,r1,#28
 bne 1f
@ here the quotient is less than 1<<28 (i.e., result mantissa <1.0)

 adds r1,#5
 lsrs r4,r1,#4     @ rounding + small reduction in systematic bias
 bcc 2f            @ skip if we are not near a rounding boundary
 lsrs r1,#3        @ quotient Q25
 lsls r0,#10       @ dividend mantissa Q48
 muls r1,r1,r2     @ quotient*divisor Q48
 subs r0,r0,r1     @ remainder Q48
 bmi 2f
 b 3f

1:
@ here the quotient is at least 1<<28 (i.e., result mantissa >=1.0)

 adds r3,#1        @ bump exponent (and shift mantissa down one more place)
 adds r1,#9
 lsrs r4,r1,#5     @ rounding + small reduction in systematic bias
 bcc 2f            @ skip if we are not near a rounding boundary

 lsrs r1,#4        @ quotient Q24
 lsls r0,#9        @ dividend mantissa Q47
 muls r1,r1,r2     @ quotient*divisor Q47
 subs r0,r0,r1     @ remainder Q47
 bmi 2f
3:
 adds r4,#1        @ increment quotient as we are above the rounding boundary

@ here
@ r3=result exponent
@ r4=correctly rounded quotient Q23 in range [1,2] *note closed interval*
@ r6b31=sign of result

2:
 cmp r3,#254
 bhs 10f           @ this catches both underflow and overflow
 lsls r1,r3,#23
 adds r0,r4,r1
 adds r0,r6
 pop {r4,r5,r6,r15}

@ here divisor is infinite; dividend exponent in r3
20:
 cmp r3,#255
 bne retzero

retinf:
 movs r0,#255
21:
 lsls r0,#23
 orrs r0,r6
 pop {r4,r5,r6,r15}

10:
 bge retinf       @ overflow?
 adds r1,r3,#1
 bne retzero      @ exponent <-1? return 0
@ here exponent is exactly -1
 lsrs r1,r4,#25
 bcc retzero      @ mantissa is not 01000000?
@ return minimum normal
 movs r0,#1
 lsls r0,#23
 orrs r0,r6
 pop {r4,r5,r6,r15}

retzero:
 movs r0,r6
 pop {r4,r5,r6,r15}

@ x2=[32:1:63]/32;
@ round(256 ./(x2+1/64))
.align 2
rcpapp:
.byte 252,245,237,231,224,218,213,207,202,197,193,188,184,180,176,172
.byte 169,165,162,159,156,153,150,148,145,142,140,138,135,133,131,129

@ The square root routine uses an initial approximation to the reciprocal of the square root of the argument based
@ on the top four bits of the mantissa (possibly shifted one place to make the exponent even). It then performs two
@ Newton-Raphson iterations, resulting in about 14 bits of accuracy. This reciprocal is then multiplied by
@ the original argument to produce an approximation to the result, again with about 14 bits of accuracy.
@ Then a remainder is calculated, and multiplied by the reciprocal estiamte to generate a correction term
@ giving a final answer to about 28 bits of accuracy. A final remainder calculation rounds to the correct
@ result if necessary.
@ Again, the fixed-point calculation is carefully implemented to preserve accuracy, and similar comments to those
@ made above on the fast division routine apply.
@ The reciprocal square root calculation has been tested for all possible (possibly shifted) input mantissa values.
.align 2
.thumb_func
qfp_fsqrt:
 push {r4}
 lsls r1,r0,#1
 bcs sq_0         @ negative?
 lsls r1,#8
 lsrs r1,#9       @ mantissa
 movs r2,#1
 lsls r2,#23
 adds r1,r2       @ insert implied 1
 lsrs r2,r0,#23   @ extract exponent
 beq sq_2         @ zero?
 cmp r2,#255      @ infinite?
 beq sq_1
 adds r2,#125     @ correction for packing
 asrs r2,#1       @ exponent/2, LSB into carry
 bcc 1f
 lsls r1,#1       @ was even: double mantissa; mantissa y now 1..4 Q23
1:
 adr r4,rsqrtapp-4@ first four table entries are never accessed because of the mantissa's leading 1
 lsrs r3,r1,#21   @ y Q2
 ldrb r4,[r4,r3]  @ initial approximation to reciprocal square root a0 Q8

 lsrs r0,r1,#7    @ y Q16: first Newton-Raphson iteration
 muls r0,r4       @ a0*y Q24
 muls r0,r4       @ r0=p0=a0*y*y Q32
 asrs r0,#12      @ r0 Q20
 muls r0,r4       @ dy0=a0*r0 Q28
 asrs r0,#13      @ dy0 Q15
 lsls r4,#8       @ a0 Q16
 subs r4,r0       @ a1=a0-dy0/2 Q16-Q15/2 -> Q16
 adds r4,#170     @ mostly remove systematic error in this approximation: gains approximately 1 bit

 movs r0,r4       @ second Newton-Raphson iteration
 muls r0,r0       @ a1*a1 Q32
 lsrs r0,#15      @ a1*a1 Q17
 lsrs r3,r1,#8    @ y Q15
 muls r0,r3       @ r1=p1=a1*a1*y Q32
 asrs r0,#12      @ r1 Q20
 muls r0,r4       @ dy1=a1*r1 Q36
 asrs r0,#21      @ dy1 Q15
 subs r4,r0       @ a2=a1-dy1/2 Q16-Q15/2 -> Q16

 muls r3,r4       @ a3=y*a2 Q31
 lsrs r3,#15      @ a3 Q16
@ here a2 is an approximation to the reciprocal square root
@ and a3 is an approximation to the square root
 movs r0,r3
 muls r0,r0       @ a3*a3 Q32
 lsls r1,#9       @ y Q32
 subs r0,r1,r0    @ r2=y-a3*a3 Q32 remainder
 asrs r0,#5       @ r2 Q27
 muls r4,r0       @ r2*a2 Q43
 lsls r3,#7       @ a3 Q23
 asrs r0,r4,#15   @ r2*a2 Q28
 adds r0,#16      @ rounding to Q24
 asrs r0,r0,#6    @ r2*a2 Q22
 add r3,r0        @ a4 Q23: candidate final result
 bcc sq_3         @ near rounding boundary? skip if no rounding needed
 mov r4,r3
 adcs r4,r4       @ a4+0.5ulp Q24
 muls r4,r4       @ Q48
 lsls r1,#16      @ y Q48
 subs r1,r4       @ remainder Q48
 bmi sq_3
 adds r3,#1       @ round up
sq_3:
 lsls r2,#23      @ pack exponent
 adds r0,r2,r3
sq_6:
 pop {r4}
 bx r14

sq_0:
 lsrs r1,#24
 beq sq_2         @ -0: return it
@ here negative and not -0: return -Inf
 asrs r0,#31
sq_5:
 lsls r0,#23
 b sq_6
sq_1:             @ +Inf
 lsrs r0,#23
 b sq_5
sq_2:
 lsrs r0,#31
 lsls r0,#31
 b sq_6

@ round(sqrt(2^22./[72:16:248]))
rsqrtapp:
.byte 0xf1,0xda,0xc9,0xbb, 0xb0,0xa6,0x9e,0x97, 0x91,0x8b,0x86,0x82

qfp_lib_end: