From eea21d422b06445729f1daec764591d083584613 Mon Sep 17 00:00:00 2001 From: Clyne Sullivan Date: Sun, 2 Jun 2024 17:59:04 -0400 Subject: [PATCH] use qfplib; fix I2S clock --- Makefile | 15 +- cfg/mcuconf.h | 4 +- main.cpp | 51 +- qfplib-m0-full-20240105/LICENCE | 339 ++ qfplib-m0-full-20240105/README | 4 + qfplib-m0-full-20240105/qfplib-m0-full.h | 97 + qfplib-m0-full-20240105/qfplib-m0-full.s | 3670 ++++++++++++++++++++++ sos-iir-filter.h | 96 +- 8 files changed, 4221 insertions(+), 55 deletions(-) create mode 100644 qfplib-m0-full-20240105/LICENCE create mode 100644 qfplib-m0-full-20240105/README create mode 100644 qfplib-m0-full-20240105/qfplib-m0-full.h create mode 100644 qfplib-m0-full-20240105/qfplib-m0-full.s diff --git a/Makefile b/Makefile index 4e12bd1..1065763 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ # Compiler options here. ifeq ($(USE_OPT),) - USE_OPT = -O3 -ggdb -fomit-frame-pointer -falign-functions=16 + USE_OPT = -O3 -ggdb -fomit-frame-pointer -falign-functions=16 -fno-stack-protector endif # C specific options here (added to USE_OPT). @@ -25,7 +25,7 @@ endif # Linker extra options here. ifeq ($(USE_LDOPT),) - USE_LDOPT = -lm + USE_LDOPT = endif # Enable this if you want link time optimizations (LTO). @@ -66,12 +66,12 @@ endif # Enables the use of FPU (no, softfp, hard). ifeq ($(USE_FPU),) - USE_FPU = softfp + USE_FPU = no endif # FPU-related options. ifeq ($(USE_FPU_OPT),) - USE_FPU_OPT = -mfloat-abi=$(USE_FPU) -mfpu=fpv4-sp-d16 + USE_FPU_OPT = #-mfloat-abi=$(USE_FPU) -mfpu=fpv4-sp-d16 endif # @@ -89,7 +89,7 @@ PROJECT = ch MCU = cortex-m0 # Imported source files and paths. -CHIBIOS := ../.. +CHIBIOS := ../../ChibiOS CONFDIR := ./cfg BUILDDIR := ./build DEPDIR := ./.dep @@ -127,14 +127,15 @@ CPPSRC = $(ALLCPPSRC) \ main.cpp # List ASM source files here. -ASMSRC = $(ALLASMSRC) +ASMSRC = $(ALLASMSRC) \ + qfplib-m0-full-20240105/qfplib-m0-full.s # List ASM with preprocessor source files here. ASMXSRC = $(ALLXASMSRC) # Inclusion directories. INCDIR = $(CONFDIR) $(ALLINC) \ - fpm/include + qfplib-m0-full-20240105 # Define C warning options here. 
CWARN = -Wall -Wextra -Wundef -Wstrict-prototypes
diff --git a/cfg/mcuconf.h b/cfg/mcuconf.h
index 6e9c549..7b25ff4 100644
--- a/cfg/mcuconf.h
+++ b/cfg/mcuconf.h
@@ -67,7 +67,7 @@
 #define STM32_PLLQ_VALUE 4
 #define STM32_PLLR_VALUE 4
 #define STM32_HPRE STM32_HPRE_DIV1
-#define STM32_PPRE STM32_PPRE_DIV1
+#define STM32_PPRE STM32_PPRE_DIV4
 #define STM32_MCOSEL STM32_MCOSEL_NOCLOCK
 #define STM32_MCOPRE STM32_MCOPRE_DIV1
 #define STM32_LSCOSEL STM32_LSCOSEL_NOCLOCK
@@ -79,7 +79,7 @@
 #define STM32_USART2SEL STM32_USART2SEL_PCLK
 #define STM32_LPUART1SEL STM32_LPUART1SEL_PCLK
 #define STM32_I2C1SEL STM32_I2C1SEL_PCLK
-#define STM32_I2S1SEL STM32_I2S1SEL_HSI16
+#define STM32_I2S1SEL STM32_I2S1SEL_SYSCLK
 #define STM32_LPTIM1SEL STM32_LPTIM1SEL_PCLK
 #define STM32_TIM1SEL STM32_TIM1SEL_TIMPCLK
 #define STM32_RNGSEL STM32_RNGSEL_HSI16
diff --git a/main.cpp b/main.cpp
index 5151e33..2db9ec8 100644
--- a/main.cpp
+++ b/main.cpp
@@ -9,31 +9,31 @@
 static constexpr auto& WEIGHTING = A_weighting;
 static constexpr auto& MIC_EQUALIZER = SPH0645LM4H_B_RB;
-static constexpr sos_t MIC_OFFSET_DB ( 0.f); // Linear offset
-static constexpr sos_t MIC_SENSITIVITY (-26.f); // dBFS value expected at MIC_REF_DB
-static constexpr sos_t MIC_REF_DB ( 94.f); // dB where sensitivity is specified
+static constexpr float MIC_OFFSET_DB ( 0.f); // Linear offset
+static constexpr float MIC_SENSITIVITY (-26.f); // dBFS value expected at MIC_REF_DB
+static constexpr float MIC_REF_DB ( 94.f); // dB where sensitivity is specified
 static constexpr sos_t MIC_OVERLOAD_DB (120.f); // dB - Acoustic overload point
 static constexpr sos_t MIC_NOISE_DB ( 29.f); // dB - Noise floor
-static constexpr auto MIC_BITS = 16u; // SPH0645 is actually 18 bits
-static constexpr auto SAMPLE_RATE = 32000u;
+static constexpr auto MIC_BITS = 18u;
+static constexpr auto SAMPLE_RATE = 48000u;
-static constexpr unsigned I2S_BUFSIZ = 512; // was 1024
-static constexpr unsigned I2S_STRIDE = 16; // was 8
+static constexpr unsigned I2S_BUFSIZ = 1024;
+static constexpr unsigned I2S_STRIDE = 16;
 // Calculate reference amplitude value at compile time
-static const auto MIC_REF_AMPL = fpm::pow(sos_t(10), MIC_SENSITIVITY / 20) *
+static const auto MIC_REF_AMPL = qfp_fpow(10.f, MIC_SENSITIVITY / 20) *
 ((1 << (MIC_BITS - 1)) - 1);
 static SEMAPHORE_DECL(i2sReady, 0);
 static THD_WORKING_AREA(waThread1, 128);
 static std::array<uint32_t, I2S_BUFSIZ> i2sBuffer;
-static sos_t Leq_sum_sqr (0);
+static sos_t Leq_sum_sqr (0.f);
 static unsigned Leq_samples = 0;
 static THD_FUNCTION(Thread1, arg);
 static void i2sCallback(I2SDriver *i2s);
-static constexpr auto I2SPRval = SAMPLE_RATE * 64 / 1000 / 1024 * 2;
+static constexpr unsigned I2SPRval = 16'000'000 / SAMPLE_RATE / 32 / 2;
 static constexpr I2SConfig i2sConfig = {
 /* TX buffer */ NULL,
 /* RX buffer */ i2sBuffer.data(),
@@ -43,11 +43,11 @@ static constexpr I2SConfig i2sConfig = {
 (0 << SPI_I2SCFGR_I2SSTD_Pos) | // Philips I2S
 (1 << SPI_I2SCFGR_DATLEN_Pos) | // 24-bit
 SPI_I2SCFGR_CHLEN, // 32-bit frame
- /* I2SPR */ I2SPRval | (((I2SPRval / 2) & 1) ? SPI_I2SPR_ODD : 0)
+ /* I2SPR */ (I2SPRval / 2) | ((I2SPRval & 1) ? SPI_I2SPR_ODD : 0)
 };
 THD_TABLE_BEGIN
- THD_TABLE_THREAD(0, "blinker1", waThread1, Thread1, NULL)
+ THD_TABLE_THREAD(0, "main", waThread1, Thread1, NULL)
 THD_TABLE_END
 int main(void)
@@ -64,6 +64,7 @@ THD_FUNCTION(Thread1, arg)
 chThdSleepMilliseconds(2000);
+ palSetPadMode(GPIOB, 7, PAL_MODE_OUTPUT_PUSHPULL);
 palSetPadMode(GPIOF, 2, PAL_MODE_UNCONNECTED);
 palSetLineMode(LINE_I2S_SD, PAL_MODE_ALTERNATE(0));
 palSetLineMode(LINE_I2S_WS, PAL_MODE_ALTERNATE(0));
@@ -79,12 +80,13 @@ THD_FUNCTION(Thread1, arg)
 uint8_t strbuf[7] = { 0, 0, 0, 'd', 'B', '\n', '\0' };
 for (;;) {
+ palSetPad(GPIOB, 7);
 chSemWait(&i2sReady);
- const auto Leq_RMS = fpm::sqrt(Leq_sum_sqr / Leq_samples);
+ const auto Leq_RMS = qfp_fsqrt((float)Leq_sum_sqr / Leq_samples);
 const auto Leq_dB = MIC_OFFSET_DB + MIC_REF_DB + 20 *
- fpm::log10(Leq_RMS / MIC_REF_AMPL);
- Leq_sum_sqr = sos_t(0);
+ qfp_flog10(Leq_RMS / MIC_REF_AMPL);
+ Leq_sum_sqr = sos_t(0.f);
 Leq_samples = 0;
 auto n = std::clamp(static_cast<long>(Leq_dB), 0l, 999l);
@@ -92,22 +94,24 @@
 strbuf[1] = n % 10 + '0'; n /= 10;
 strbuf[0] = n ? n + '0' : ' ';
 sdWrite(&SD2, strbuf, sizeof(strbuf));
+ palClearPad(GPIOB, 7);
 }
 }
+int32_t fixsample(uint32_t s) {
+ return (int32_t)(((s & 0xFFFF) << 16) | (s >> 16)) >> (32 - MIC_BITS);
+}
+
 void i2sCallback(I2SDriver *i2s)
 {
- // Note: samples come in as 16-bit big-endian, so for simplicity we just
- // take the "high" 16-bits of valid data and drop the rest. Mic is
- // technically 18-bit, so we are losing some precision.
-
+ palSetPad(GPIOB, 7);
 const auto halfsize = i2sBuffer.size() / 2;
 const auto offset = i2sIsBufferComplete(i2s) ? halfsize : 0;
 auto samples = reinterpret_cast<sos_t *>(i2sBuffer.data() + offset);
 std::ranges::copy(
- std::views::counted(i2sBuffer.begin() + offset, halfsize)
- | std::ranges::views::stride(2 * I2S_STRIDE)
- | std::views::transform([](auto s) { return sos_t((int16_t)s); }),
+ std::views::counted(i2sBuffer.begin() + offset, halfsize / I2S_STRIDE)
+ | std::ranges::views::stride(2)
+ | std::views::transform([](uint32_t s) { return sos_t(fixsample(s)); }),
 samples);
 auto samps = std::views::counted(samples, halfsize / (2 * I2S_STRIDE));
@@ -117,8 +121,9 @@ void i2sCallback(I2SDriver *i2s)
 Leq_samples += samps.size();
 // Wakeup main thread for dB calculation every second
- if (Leq_samples >= 2 * SAMPLE_RATE / I2S_STRIDE) {
+ if (Leq_samples >= SAMPLE_RATE / I2S_STRIDE) {
 chSemSignalI(&i2sReady);
 }
+ palClearPad(GPIOB, 7);
 }
diff --git a/qfplib-m0-full-20240105/LICENCE b/qfplib-m0-full-20240105/LICENCE
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/qfplib-m0-full-20240105/LICENCE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+ + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. 
+ +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
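A note on two details of the main.cpp changes above. First, the I2S clock fix: each stereo
frame is 2 channels of 32 bit-clocks, so with the 16 MHz source clock hard-coded in the new
expression, I2SPRval = 16'000'000 / 48'000 / 32 / 2 = 5 (integer division). The new I2SPR
packing turns this into I2SDIV = 2 with the ODD bit set, giving an actual sample rate of
16 MHz / (64 * (2*I2SDIV + ODD)) = 50 kHz, the closest this divider can get to the nominal
48 kHz SAMPLE_RATE. Second, main.cpp calls qfp_fpow() and qfp_flog10(), which
qfplib-m0-full.h (below) does not declare; they presumably come from elsewhere in the
project (the sos-iir-filter.h part of this patch is not shown in this excerpt). A minimal
sketch of what such wrappers could look like, using only functions the header does declare;
the wrapper names and the ln(10) constant here are assumptions for illustration, not part
of the patch:

    #include "qfplib-m0-full.h"

    /* x^y = e^(y ln x), valid for x > 0; covers qfp_fpow(10.f, ...) as used above */
    static inline float qfp_fpow(float x, float y)
    {
        return qfp_fexp(qfp_fmul(y, qfp_fln(x)));
    }

    /* log10(x) = ln(x) / ln(10), with ln(10) ~= 2.302585093 */
    static inline float qfp_flog10(float x)
    {
        return qfp_fdiv(qfp_fln(x), 2.302585093f);
    }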
diff --git a/qfplib-m0-full-20240105/README b/qfplib-m0-full-20240105/README
new file mode 100644
index 0000000..99e878a
--- /dev/null
+++ b/qfplib-m0-full-20240105/README
@@ -0,0 +1,4 @@
+The Qfplib libraries are open source, licensed under version 2 of the
+GNU GPL. A copy of that licence is included in this archive.
+
+Visit http://www.quinapalus.com/qfplib.html for more information.
diff --git a/qfplib-m0-full-20240105/qfplib-m0-full.h b/qfplib-m0-full-20240105/qfplib-m0-full.h
new file mode 100644
index 0000000..36d17d2
--- /dev/null
+++ b/qfplib-m0-full-20240105/qfplib-m0-full.h
@@ -0,0 +1,97 @@
+#ifndef __QFPLIB_M0_FULL_H__
+#define __QFPLIB_M0_FULL_H__
+
+/*
+Copyright 2019-2024 Mark Owen
+http://www.quinapalus.com
+E-mail: qfp@quinapalus.com
+
+This file is free software: you can redistribute it and/or modify
+it under the terms of version 2 of the GNU General Public License
+as published by the Free Software Foundation.
+
+This file is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this file. If not, see <http://www.gnu.org/licenses/> or
+write to the Free Software Foundation, Inc., 51 Franklin Street,
+Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+typedef unsigned int ui32;
+typedef int i32;
+typedef unsigned long long int ui64;
+typedef long long int i64;
+
+extern float qfp_fadd (float x,float y);
+extern float qfp_fsub (float x,float y);
+extern float qfp_fmul (float x,float y);
+extern float qfp_fdiv (float x,float y);
+extern int qfp_fcmp (float x,float y);
+extern float qfp_fsqrt (float x);
+extern i32 qfp_float2int (float x);
+extern i32 qfp_float2fix (float x,int f);
+extern ui32 qfp_float2uint (float x);
+extern ui32 qfp_float2ufix (float x,int f);
+extern float qfp_int2float (i32 x);
+extern float qfp_fix2float (i32 x,int f);
+extern float qfp_uint2float (ui32 x);
+extern float qfp_ufix2float (ui32 x,int f);
+extern float qfp_int642float (i64 x);
+extern float qfp_fix642float (i64 x,int f);
+extern float qfp_uint642float (ui64 x);
+extern float qfp_ufix642float (ui64 x,int f);
+extern float qfp_fcos (float x);
+extern float qfp_fsin (float x);
+extern float qfp_ftan (float x);
+extern float qfp_fatan2 (float y,float x);
+extern float qfp_fexp (float x);
+extern float qfp_fln (float x);
+
+extern double qfp_dadd (double x,double y);
+extern double qfp_dsub (double x,double y);
+extern double qfp_dmul (double x,double y);
+extern double qfp_ddiv (double x,double y);
+extern double qfp_dsqrt (double x);
+extern double qfp_dcos (double x);
+extern double qfp_dsin (double x);
+extern double qfp_dtan (double x);
+extern double qfp_datan2 (double y,double x);
+extern double qfp_dexp (double x);
+extern double qfp_dln (double x);
+extern int qfp_dcmp (double x,double y);
+
+extern i64 qfp_float2int64 (float x);
+extern i64 qfp_float2fix64 (float x,int f);
+extern ui64 qfp_float2uint64 (float x);
+extern ui64 qfp_float2ufix64 (float x,int f);
+extern i32 qfp_float2int_z (float x);
+extern i64 qfp_float2int64_z (float x);
+
+extern i32 qfp_double2int (double x);
+extern i32 qfp_double2fix (double x,int f);
+extern ui32 qfp_double2uint (double x);
+extern ui32 qfp_double2ufix (double x,int f);
+extern i64 qfp_double2int64 (double x);
+extern i64 qfp_double2fix64 (double x,int f);
+extern ui64 qfp_double2uint64 (double x);
+extern ui64 qfp_double2ufix64 (double x,int f);
+extern i32 qfp_double2int_z (double x);
+extern i64 qfp_double2int64_z(double x);
+
+extern double qfp_int2double (i32 x);
+extern double qfp_fix2double (i32 x,int f);
+extern double qfp_uint2double (ui32 x);
+extern double qfp_ufix2double (ui32 x,int f);
+extern double qfp_int642double (i64 x);
+extern double qfp_fix642double (i64 x,int f);
+extern double qfp_uint642double (ui64 x);
+extern double qfp_ufix642double (ui64 x,int f);
+
+extern float qfp_double2float (double x);
+extern double qfp_float2double (float x);
+
+#endif
diff --git a/qfplib-m0-full-20240105/qfplib-m0-full.s b/qfplib-m0-full-20240105/qfplib-m0-full.s
new file mode 100644
index 0000000..0a354f1
--- /dev/null
+++ b/qfplib-m0-full-20240105/qfplib-m0-full.s
@@ -0,0 +1,3670 @@
+@ Copyright 2019-2024 Mark Owen
+@ http://www.quinapalus.com
+@ E-mail: qfp@quinapalus.com
+@
+@ This file is free software: you can redistribute it and/or modify
+@ it under the terms of version 2 of the GNU General Public License
+@ as published by the Free Software Foundation.
+@
+@ This file is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+@ GNU General Public License for more details.
+@
+@ You should have received a copy of the GNU General Public License
+@ along with this file. If not, see <http://www.gnu.org/licenses/> or
+@ write to the Free Software Foundation, Inc., 51 Franklin Street,
+@ Fifth Floor, Boston, MA 02110-1301, USA.
+
+.syntax unified
+.cpu cortex-m0plus
+.thumb
+
+@ exported symbols
+
+.global qfp_fadd
+.global qfp_fsub
+.global qfp_fmul
+.global qfp_fdiv
+.global qfp_fcmp
+.global qfp_fsqrt
+.global qfp_float2int
+.global qfp_float2fix
+.global qfp_float2uint
+.global qfp_float2ufix
+.global qfp_int2float
+.global qfp_fix2float
+.global qfp_uint2float
+.global qfp_ufix2float
+.global qfp_int642float
+.global qfp_fix642float
+.global qfp_uint642float
+.global qfp_ufix642float
+.global qfp_fcos
+.global qfp_fsin
+.global qfp_ftan
+.global qfp_fatan2
+.global qfp_fexp
+.global qfp_fln
+
+.global qfp_dadd
+.global qfp_dsub
+.global qfp_dmul
+.global qfp_ddiv
+.global qfp_dsqrt
+.global qfp_dcos
+.global qfp_dsin
+.global qfp_dtan
+.global qfp_datan2
+.global qfp_dexp
+.global qfp_dln
+.global qfp_dcmp
+
+.global qfp_float2int64
+.global qfp_float2fix64
+.global qfp_float2uint64
+.global qfp_float2ufix64
+.global qfp_float2int_z
+.global qfp_float2int64_z
+
+.global qfp_double2int
+.global qfp_double2fix
+.global qfp_double2uint
+.global qfp_double2ufix
+.global qfp_double2int64
+.global qfp_double2fix64
+.global qfp_double2uint64
+.global qfp_double2ufix64
+.global qfp_double2int_z
+.global qfp_double2int64_z
+
+.global qfp_int2double
+.global qfp_fix2double
+.global qfp_uint2double
+.global qfp_ufix2double
+.global qfp_int642double
+.global qfp_fix642double
+.global qfp_uint642double
+.global qfp_ufix642double
+
+.global qfp_double2float
+.global qfp_float2double
+
+qfp_lib_start:
+
+@ exchange r0<->r1, r2<->r3
+xchxy:
+ push {r0,r2,r14}
+ mov r0,r1
+ mov r2,r3
+ pop {r1,r3,r15}
+
+@ IEEE single in r0-> signed (two's complement) mantissa in r0 9Q23 (24 significant bits), signed exponent (bias removed) in r2
+@ trashes r4; zero, denormal -> mantissa=+/-1, exponent=-380; Inf, NaN -> mantissa=+/-1, exponent=+640
+unpackx:
+ lsrs r2,r0,#23 @ save exponent and sign
+ lsls r0,#9 @ extract mantissa
+ lsrs r0,#9
+ movs r4,#1
+ lsls r4,#23
+ orrs r0,r4 @ reinstate implied leading 1
+ cmp r2,#255 @ test sign bit
uxtb r2,r2 @ clear it + bls 1f @ branch on positive + rsbs r0,#0 @ negate mantissa +1: + subs r2,#1 + cmp r2,#254 @ zero/denormal/Inf/NaN? + bhs 2f + subs r2,#126 @ remove exponent bias: can now be -126..+127 + bx r14 + +2: @ here with special-case values + cmp r0,#0 + mov r0,r4 @ set mantissa to +1 + bpl 3f + rsbs r0,#0 @ zero/denormal/Inf/NaN: mantissa=+/-1 +3: + subs r2,#126 @ zero/denormal: exponent -> -127; Inf, NaN: exponent -> 128 + lsls r2,#2 @ zero/denormal: exponent -> -508; Inf, NaN: exponent -> 512 + adds r2,#128 @ zero/denormal: exponent -> -380; Inf, NaN: exponent -> 640 + bx r14 + +@ normalise and pack signed mantissa in r0 nominally 3Q29, signed exponent in r2-> IEEE single in r0 +@ trashes r4, preserves r1,r3 +@ r5: "sticky bits", must be zero iff all result bits below r0 are zero for correct rounding +packx: + lsrs r4,r0,#31 @ save sign bit + lsls r4,r4,#31 @ sign now in b31 + bpl 2f @ skip if positive + cmp r5,#0 + beq 11f + adds r0,#1 @ fiddle carry in to following rsb if sticky bits are non-zero +11: + rsbs r0,#0 @ can now treat r0 as unsigned +packx0: + bmi 3f @ catch r0=0x80000000 case +2: + subs r2,#1 @ normalisation loop + adds r0,r0 + beq 1f @ zero? special case + bpl 2b @ normalise so leading "1" in bit 31 +3: + adds r2,#129 @ (mis-)offset exponent + bne 12f @ special case: highest denormal can round to lowest normal + adds r0,#0x80 @ in special case, need to add 256 to r0 for rounding + bcs 4f @ tripped carry? then have leading 1 in C as required +12: + adds r0,#0x80 @ rounding + bcs 4f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits) + cmp r5,#0 + beq 7f @ sticky bits zero? +8: + lsls r0,#1 @ remove leading 1 +9: + subs r2,#1 @ compensate exponent on this path +4: + cmp r2,#254 + bge 5f @ overflow? + adds r2,#1 @ correct exponent offset + ble 10f @ denormal/underflow? + lsrs r0,#9 @ align mantissa + lsls r2,#23 @ align exponent + orrs r0,r2 @ assemble exponent and mantissa +6: + orrs r0,r4 @ apply sign +1: + bx r14 + +5: + movs r0,#0xff @ create infinity + lsls r0,#23 + b 6b + +10: + movs r0,#0 @ create zero + bx r14 + +7: @ sticky bit rounding case + lsls r5,r0,#24 @ check bottom 8 bits of r0 + bne 8b @ in rounding-tie case? + lsrs r0,#9 @ ensure even result + lsls r0,#10 + b 9b + +.align 2 +.ltorg + +@ signed multiply r0 1Q23 by r1 4Q23, result in r0 7Q25, sticky bits in r5 +@ trashes r3,r4 +mul0: + uxth r3,r0 @ Q23 + asrs r4,r1,#16 @ Q7 + muls r3,r4 @ L*H, Q30 signed + asrs r4,r0,#16 @ Q7 + uxth r5,r1 @ Q23 + muls r4,r5 @ H*L, Q30 signed + adds r3,r4 @ sum of middle partial products + uxth r4,r0 + muls r4,r5 @ L*L, Q46 unsigned + lsls r5,r4,#16 @ initialise sticky bits from low half of low partial product + lsrs r4,#16 @ Q25 + adds r3,r4 @ add high half of low partial product to sum of middle partial products + @ (cannot generate carry by limits on input arguments) + asrs r0,#16 @ Q7 + asrs r1,#16 @ Q7 + muls r0,r1 @ H*H, Q14 signed + lsls r0,#11 @ high partial product Q25 + lsls r1,r3,#27 @ sticky + orrs r5,r1 @ collect further sticky bits + asrs r1,r3,#5 @ middle partial products Q25 + adds r0,r1 @ final result + bx r14 + +.thumb_func +qfp_fcmp: + lsls r2,r0,#1 + lsrs r2,#24 + beq 1f + cmp r2,#0xff + bne 2f +1: + lsrs r0,#23 @ clear mantissa if NaN or denormal + lsls r0,#23 +2: + lsls r2,r1,#1 + lsrs r2,#24 + beq 1f + cmp r2,#0xff + bne 2f +1: + lsrs r1,#23 @ clear mantissa if NaN or denormal + lsls r1,#23 +2: + movs r2,#1 @ initialise result + eors r1,r0 + bmi 4f @ opposite signs? 
then can proceed on basis of sign of x + eors r1,r0 @ restore y + bpl 1f + rsbs r2,#0 @ both negative? flip comparison +1: + cmp r0,r1 + bgt 2f + blt 3f +5: + movs r2,#0 +3: + rsbs r2,#0 +2: + subs r0,r2,#0 + bx r14 +4: + orrs r1,r0 + adds r1,r1 + beq 5b + cmp r0,#0 + bge 2b + b 3b + +@ convert float to signed int, rounding towards 0, clamping +.thumb_func +qfp_float2int_z: + push {r14} + cmp r0,#0 + blt 1f + bl qfp_float2int @ +ve or zero? then use rounding towards -Inf + cmp r0,#0 + bge 2f + ldr r0,=#0x7fffffff +2: + pop {r15} +1: + lsls r0,#1 @ -ve: clear sign bit + lsrs r0,#1 + bl qfp_float2uint @ convert to unsigned, rounding towards -Inf + cmp r0,#0 + blt 1f + rsbs r0,#0 + pop {r15} +1: + movs r0,#1 + lsls r0,#31 @ 0x80000000 + pop {r15} + +.ltorg + +@ convert float to signed int, rounding towards -Inf, clamping +.thumb_func +qfp_float2int: + movs r1,#0 @ fall through + +@ convert float in r0 to signed fixed point in r0, clamping +.thumb_func +qfp_float2fix: + push {r4,r14} + bl unpackx + movs r3,r2 + adds r3,#130 + bmi 6f @ -0? + add r2,r1 @ incorporate binary point position into exponent + subs r2,#23 @ r2 is now amount of left shift required + blt 1f @ requires right shift? + cmp r2,#7 @ overflow? + ble 4f +3: @ overflow + asrs r1,r0,#31 @ +ve:0 -ve:0xffffffff + mvns r1,r1 @ +ve:0xffffffff -ve:0 + movs r0,#1 + lsls r0,#31 +5: + eors r0,r1 @ +ve:0x7fffffff -ve:0x80000000 (unsigned path: 0xffffffff) + pop {r4,r15} +1: + rsbs r2,#0 @ right shift for r0, >0 + cmp r2,#32 + blt 2f @ more than 32 bits of right shift? + movs r2,#32 +2: + asrs r0,r0,r2 + pop {r4,r15} +6: + movs r0,#0 + pop {r4,r15} + +@ unsigned version +.thumb_func +qfp_float2uint: + movs r1,#0 @ fall through +.thumb_func +qfp_float2ufix: + push {r4,r14} + bl unpackx + add r2,r1 @ incorporate binary point position into exponent + movs r1,r0 + bmi 5b @ negative? return zero + subs r2,#23 @ r2 is now amount of left shift required + blt 1b @ requires right shift? + mvns r1,r0 @ ready to return 0xffffffff + cmp r2,#8 @ overflow? + bgt 5b +4: + lsls r0,r0,r2 @ result fits, left shifted + pop {r4,r15} + + +@ convert uint64 to float, rounding +.thumb_func +qfp_uint642float: + movs r2,#0 @ fall through + +@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2 +.thumb_func +qfp_ufix642float: + push {r4,r5,r14} + cmp r1,#0 + bpl 3f @ positive? we can use signed code + lsls r5,r1,#31 @ contribution to sticky bits + orrs r5,r0 + lsrs r0,r1,#1 + subs r2,#1 + b 4f + +@ convert int64 to float, rounding +.thumb_func +qfp_int642float: + movs r2,#0 @ fall through + +@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2 +.thumb_func +qfp_fix642float: + push {r4,r5,r14} +3: + movs r5,r0 + orrs r5,r1 + beq ret_pop45 @ zero? return +0 + asrs r5,r1,#31 @ sign bits +2: + asrs r4,r1,#24 @ try shifting 7 bits at a time + cmp r4,r5 + bne 1f @ next shift will overflow? 
+ lsls r1,#7 + lsrs r4,r0,#25 + orrs r1,r4 + lsls r0,#7 + adds r2,#7 + b 2b +1: + movs r5,r0 + movs r0,r1 +4: + rsbs r2,#0 + adds r2,#32+29 + b packret + +@ convert signed int to float, rounding +.thumb_func +qfp_int2float: + movs r1,#0 @ fall through + +@ convert signed fix to float, rounding; number of r0 bits after point in r1 +.thumb_func +qfp_fix2float: + push {r4,r5,r14} +1: + movs r2,#29 + subs r2,r1 @ fix exponent +packretns: @ pack and return, sticky bits=0 + movs r5,#0 +packret: @ common return point: "pack and return" + bl packx +ret_pop45: + pop {r4,r5,r15} + + +@ unsigned version +.thumb_func +qfp_uint2float: + movs r1,#0 @ fall through +.thumb_func +qfp_ufix2float: + push {r4,r5,r14} + cmp r0,#0 + bge 1b @ treat <2^31 as signed + movs r2,#30 + subs r2,r1 @ fix exponent + lsls r5,r0,#31 @ one sticky bit + lsrs r0,#1 + b packret + +@ All the scientific functions are implemented using the CORDIC algorithm. For notation, +@ details not explained in the comments below, and a good overall survey see +@ "50 Years of CORDIC: Algorithms, Architectures, and Applications" by Meher et al., +@ IEEE Transactions on Circuits and Systems Part I, Volume 56 Issue 9. + +@ Register use: +@ r0: x +@ r1: y +@ r2: z/omega +@ r3: coefficient pointer +@ r4,r12: m +@ r5: i (shift) + +cordic_start: @ initialisation + movs r5,#0 @ initial shift=0 + mov r12,r4 + b 5f + +cordic_vstep: @ one step of algorithm in vector mode + cmp r1,#0 @ check sign of y + bgt 4f + b 1f +cordic_rstep: @ one step of algorithm in rotation mode + cmp r2,#0 @ check sign of angle + bge 1f +4: + subs r1,r6 @ negative rotation: y=y-(x>>i) + rsbs r7,#0 + adds r2,r4 @ accumulate angle + b 2f +1: + adds r1,r6 @ positive rotation: y=y+(x>>i) + subs r2,r4 @ accumulate angle +2: + mov r4,r12 + muls r7,r4 @ apply sign from m + subs r0,r7 @ finish rotation: x=x{+/-}(y>>i) +5: + ldmia r3!,{r4} @ fetch next angle from table and bump pointer + lsrs r4,#1 @ repeated angle? 
+ bcs 3f + adds r5,#1 @ adjust shift if not +3: + mov r6,r0 + asrs r6,r5 @ x>>i + mov r7,r1 + asrs r7,r5 @ y>>i + lsrs r4,#1 @ shift end flag into carry + bx r14 + +@ CORDIC rotation mode +cordic_rot: + push {r6,r7,r14} + bl cordic_start @ initialise +1: + bl cordic_rstep + bcc 1b @ step until table finished + asrs r6,r0,#14 @ remaining small rotations can be linearised: see IV.B of paper referenced above + asrs r7,r1,#14 + asrs r2,#3 + muls r6,r2 @ all remaining CORDIC steps in a multiplication + muls r7,r2 + mov r4,r12 + muls r7,r4 + asrs r6,#12 + asrs r7,#12 + subs r0,r7 @ x=x{+/-}(yz>>k) + adds r1,r6 @ y=y+(xz>>k) +cordic_exit: + pop {r6,r7,r15} + +@ CORDIC vector mode +cordic_vec: + push {r6,r7,r14} + bl cordic_start @ initialise +1: + bl cordic_vstep + bcc 1b @ step until table finished +4: + cmp r1,#0 @ continue as in cordic_vstep but without using table; x is not affected as y is small + bgt 2f @ check sign of y + adds r1,r6 @ positive rotation: y=y+(x>>i) + subs r2,r4 @ accumulate angle + b 3f +2: + subs r1,r6 @ negative rotation: y=y-(x>>i) + adds r2,r4 @ accumulate angle +3: + asrs r6,#1 + asrs r4,#1 @ next "table entry" + bne 4b + b cordic_exit + +.thumb_func +qfp_fsin: @ calculate sin and cos using CORDIC rotation method + push {r4,r5,r14} + movs r1,#24 + bl qfp_float2fix @ range reduction by repeated subtraction/addition in fixed point + ldr r4,pi_q29 + lsrs r4,#4 @ 2pi Q24 +1: + subs r0,r4 + bge 1b +1: + adds r0,r4 + bmi 1b @ now in range 0..2pi + lsls r2,r0,#2 @ z Q26 + lsls r5,r4,#1 @ pi Q26 (r4=pi/2 Q26) + ldr r0,=#0x136e9db4 @ initialise CORDIC x,y with scaling + movs r1,#0 +1: + cmp r2,r4 @ >pi/2? + blt 2f + subs r2,r5 @ reduce range to -pi/2..pi/2 + rsbs r0,#0 @ rotate vector by pi + b 1b +2: + lsls r2,#3 @ Q29 + adr r3,tab_cc @ circular coefficients + movs r4,#1 @ m=1 + bl cordic_rot + adds r1,#9 @ fiddle factor to make sin(0)==0 + movs r2,#0 @ exponents to zero + movs r3,#0 + movs r5,#0 @ no sticky bits + bl clampx + bl packx @ pack cosine + bl xchxy + bl clampx + b packretns @ pack sine + +.thumb_func +qfp_fcos: + push {r14} + bl qfp_fsin + mov r0,r1 @ extract cosine result + pop {r15} + +@ force r0 to lie in range [-1,1] Q29 +clampx: + movs r4,#1 + lsls r4,#29 + cmp r0,r4 + bgt 1f + rsbs r4,#0 + cmp r0,r4 + ble 1f + bx r14 +1: + movs r0,r4 + bx r14 + +.thumb_func +qfp_ftan: + push {r4,r5,r6,r14} + bl qfp_fsin @ sine in r0/r2, cosine in r1/r3 + b fdiv_n @ sin/cos + +.thumb_func +qfp_fexp: + push {r4,r5,r14} + movs r1,#24 + bl qfp_float2fix @ Q24: covers entire valid input range + asrs r1,r0,#16 @ Q8 + ldr r2,=#5909 @ log_2(e) Q12 + muls r2,r1 @ estimate exponent of result Q20 (always an underestimate) + asrs r2,#20 @ Q0 + lsls r1,r0,#6 @ Q30 + ldr r0,=#0x2c5c85fe @ ln(2) Q30 + muls r0,r2 @ accurate contribution of estimated exponent + subs r1,r0 @ residual to be exponentiated, guaranteed ≥0, < about 0.75 Q30 + +@ here +@ r1: mantissa to exponentiate, 0...~0.75 Q30 +@ r2: first exponent estimate + + movs r5,#1 @ shift + adr r3,ftab_exp @ could use alternate words from dtab_exp to save space if required + movs r0,#1 + lsls r0,#29 @ x=1 Q29 + +3: + ldmia r3!,{r4} + subs r4,r1,r4 + bmi 1f + movs r1,r4 @ keep result of subtraction + movs r4,r0 + lsrs r4,r5 + adcs r0,r4 @ x+=x>>i with rounding + +1: + adds r5,#1 + cmp r5,#15 + bne 3b + +@ here +@ r0: exp a Q29 1..2+ +@ r1: ε (residual x where x=a+ε), < 2^-14 Q30 +@ r2: first exponent estimate +@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε) + + lsrs r3,r0,#15 @ exp a Q14 + muls r3,r1 @ ε exp a Q44 + lsrs r3,#15 
@ ε exp a Q29 + adcs r0,r3 @ (1+ε) exp a Q29 with rounding + + b packretns @ pack result + +.thumb_func +qfp_fln: + push {r4,r5,r14} + asrs r1,r0,#23 + bmi 3f @ -ve argument? + beq 3f @ 0 argument? + cmp r1,#0xff + beq 4f @ +Inf/NaN + bl unpackx + adds r2,#1 + ldr r3,=#0x2c5c85fe @ ln(2) Q30 + lsrs r1,r3,#14 @ ln(2) Q16 + muls r1,r2 @ result estimate Q16 + asrs r1,#16 @ integer contribution to result + muls r3,r2 + lsls r4,r1,#30 + subs r3,r4 @ fractional contribution to result Q30, signed + lsls r0,#8 @ Q31 + +@ here +@ r0: mantissa Q31 +@ r1: integer contribution to result +@ r3: fractional contribution to result Q30, signed + + movs r5,#1 @ shift + adr r4,ftab_exp @ could use alternate words from dtab_exp to save space if required + +2: + movs r2,r0 + lsrs r2,r5 + adcs r2,r0 @ x+(x>>i) with rounding + bcs 1f @ >=2? + movs r0,r2 @ keep result + ldr r2,[r4] + subs r3,r2 +1: + adds r4,#4 + adds r5,#1 + cmp r5,#15 + bne 2b + +@ here +@ r0: residual x, nearly 2 Q31 +@ r1: integer contribution to result +@ r3: fractional part of result Q30 + + asrs r0,#2 + adds r0,r3,r0 + + cmp r1,#0 + bne 2f + + asrs r0,#1 + lsls r1,#29 + adds r0,r1 + movs r2,#0 + b packretns + +2: + lsls r1,#24 + asrs r0,#6 @ Q24 + adcs r0,r1 @ with rounding + movs r2,#5 + b packretns + +3: + ldr r0,=#0xff800000 @ -Inf + pop {r4,r5,r15} +4: + ldr r0,=#0x7f800000 @ +Inf + pop {r4,r5,r15} + +.align 2 +ftab_exp: +.word 0x19f323ed @ log 1+2^-1 Q30 +.word 0x0e47fbe4 @ log 1+2^-2 Q30 +.word 0x0789c1dc @ log 1+2^-3 Q30 +.word 0x03e14618 @ log 1+2^-4 Q30 +.word 0x01f829b1 @ log 1+2^-5 Q30 +.word 0x00fe0546 @ log 1+2^-6 Q30 +.word 0x007f80aa @ log 1+2^-7 Q30 +.word 0x003fe015 @ log 1+2^-8 Q30 +.word 0x001ff803 @ log 1+2^-9 Q30 +.word 0x000ffe00 @ log 1+2^-10 Q30 +.word 0x0007ff80 @ log 1+2^-11 Q30 +.word 0x0003ffe0 @ log 1+2^-12 Q30 +.word 0x0001fff8 @ log 1+2^-13 Q30 +.word 0x0000fffe @ log 1+2^-14 Q30 + +.thumb_func +qfp_fatan2: + push {r4,r5,r14} + +@ unpack arguments and shift one down to have common exponent + bl unpackx + bl xchxy + bl unpackx + lsls r0,r0,#5 @ Q28 + lsls r1,r1,#5 @ Q28 + adds r4,r2,r3 @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise + asrs r4,#9 + adds r4,#1 + bmi 2f @ force y to 0 proper, so result will be zero + subs r4,r2,r3 @ calculate shift + bge 1f @ ex>=ey? + rsbs r4,#0 @ make shift positive + asrs r0,r4 + cmp r4,#28 + blo 3f + asrs r0,#31 + b 3f +1: + asrs r1,r4 + cmp r4,#28 + blo 3f +2: +@ here |x|>>|y| or both x and y are ±0 + cmp r0,#0 + bge 4f @ x positive, return signed 0 + ldr r0,pi_q29 @ x negative, return +/- pi + asrs r1,#31 + eors r0,r1 + b 7f +4: + asrs r0,r1,#31 + b 7f +3: + movs r2,#0 @ initial angle + cmp r0,#0 @ x negative + bge 5f + rsbs r0,#0 @ rotate to 1st/4th quadrants + rsbs r1,#0 + ldr r2,pi_q29 @ pi Q29 +5: + adr r3,tab_cc @ circular coefficients + movs r4,#1 @ m=1 + bl cordic_vec @ also produces magnitude (with scaling factor 1.646760119), which is discarded + mov r0,r2 @ result here is -pi/2..3pi/2 Q29 +@ asrs r2,#29 +@ subs r0,r2 + ldr r2,pi_q29 @ pi Q29 + adds r4,r0,r2 @ attempt to fix -3pi/2..-pi case + bcs 6f @ -pi/2..0? 
leave result as is + subs r4,r0,r2 @ pi: take off 2pi +6: + subs r0,#1 @ fiddle factor so atan2(0,1)==0 +7: + movs r2,#0 @ exponent for pack + b packretns + +.align 2 +.ltorg + +@ first entry in following table is pi Q29 +pi_q29: +@ circular CORDIC coefficients: atan(2^-i), b0=flag for preventing shift, b1=flag for end of table +tab_cc: +.word 0x1921fb54*4+1 @ no shift before first iteration +.word 0x0ed63383*4+0 +.word 0x07d6dd7e*4+0 +.word 0x03fab753*4+0 +.word 0x01ff55bb*4+0 +.word 0x00ffeaae*4+0 +.word 0x007ffd55*4+0 +.word 0x003fffab*4+0 +.word 0x001ffff5*4+0 +.word 0x000fffff*4+0 +.word 0x0007ffff*4+0 +.word 0x00040000*4+0 +.word 0x00020000*4+0+2 @ +2 marks end + +.align 2 +.thumb_func +qfp_fsub: + ldr r2,=#0x80000000 + eors r1,r2 @ flip sign on second argument +@ drop into fadd, on .align2:ed boundary + +.thumb_func +qfp_fadd: + push {r4,r5,r6,r14} + asrs r4,r0,#31 + lsls r2,r0,#1 + lsrs r2,#24 @ x exponent + beq fa_xe0 + cmp r2,#255 + beq fa_xe255 +fa_xe: + asrs r5,r1,#31 + lsls r3,r1,#1 + lsrs r3,#24 @ y exponent + beq fa_ye0 + cmp r3,#255 + beq fa_ye255 +fa_ye: + ldr r6,=#0x007fffff + ands r0,r0,r6 @ extract mantissa bits + ands r1,r1,r6 + adds r6,#1 @ r6=0x00800000 + orrs r0,r0,r6 @ set implied 1 + orrs r1,r1,r6 + eors r0,r0,r4 @ complement... + eors r1,r1,r5 + subs r0,r0,r4 @ ... and add 1 if sign bit is set: 2's complement + subs r1,r1,r5 + subs r5,r3,r2 @ ye-xe + subs r4,r2,r3 @ xe-ye + bmi fa_ygtx +@ here xe>=ye + cmp r4,#30 + bge fa_xmgty @ xe much greater than ye? + adds r5,#32 + movs r3,r2 @ save exponent +@ here y in r1 must be shifted down r4 places to align with x in r0 + movs r2,r1 + lsls r2,r2,r5 @ keep the bits we will shift off the bottom of r1 + asrs r1,r1,r4 + b fa_0 + +.ltorg + +fa_ymgtx: + movs r2,#0 @ result is just y + movs r0,r1 + b fa_1 +fa_xmgty: + movs r3,r2 @ result is just x + movs r2,#0 + b fa_1 + +fa_ygtx: +@ here ye>xe + cmp r5,#30 + bge fa_ymgtx @ ye much greater than xe? + adds r4,#32 +@ here x in r0 must be shifted down r5 places to align with y in r1 + movs r2,r0 + lsls r2,r2,r4 @ keep the bits we will shift off the bottom of r0 + asrs r0,r0,r5 +fa_0: + adds r0,r1 @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3 + beq fa_9 @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0 +fa_1: + lsrs r1,r0,#31 @ sign bit + beq fa_8 + mvns r0,r0 + rsbs r2,r2,#0 + bne fa_8 + adds r0,#1 +fa_8: + adds r6,r6 +@ r6=0x01000000 + cmp r0,r6 + bhs fa_2 +fa_3: + adds r2,r2 @ normalisation loop + adcs r0,r0 + subs r3,#1 @ adjust exponent + cmp r0,r6 + blo fa_3 +fa_2: +@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit + lsrs r0,#1 + bcc fa_4 +@ rounding bits here are 1:r2 + adds r0,#1 @ round up + cmp r2,#0 + beq fa_5 @ sticky bits all zero? +fa_4: + cmp r3,#254 + bhs fa_6 @ exponent too large or negative? + lsls r1,#31 @ pack everything + add r0,r1 + lsls r3,#23 + add r0,r3 +fa_end: + pop {r4,r5,r6,r15} + +fa_9: + cmp r2,#0 @ result zero? 
+ beq fa_end @ return +0
+ b fa_1
+
+fa_5:
+ lsrs r0,#1
+ lsls r0,#1 @ round to even
+ b fa_4
+
+fa_6:
+ bge fa_7
+@ underflow
+@ can handle denormals here
+ lsls r0,r1,#31 @ result is signed zero
+ pop {r4,r5,r6,r15}
+fa_7:
+@ overflow
+ lsls r0,r1,#8
+ adds r0,#255
+ lsls r0,#23 @ result is signed infinity
+ pop {r4,r5,r6,r15}
+
+
+fa_xe0:
+@ can handle denormals here
+ subs r2,#32
+ adds r2,r4 @ exponent -32 for +ve, -33 for -ve
+ b fa_xe
+
+fa_xe255:
+@ can handle NaNs here
+ lsls r2,#8
+ add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
+ b fa_xe
+
+fa_ye0:
+@ can handle denormals here
+ subs r3,#32
+ adds r3,r5 @ exponent -32 for +ve, -33 for -ve
+ b fa_ye
+
+fa_ye255:
+@ can handle NaNs here
+ lsls r3,#8
+ add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
+ b fa_ye
+
+
+.align 2
+.thumb_func
+qfp_fmul:
+ push {r7,r14}
+ mov r2,r0
+ eors r2,r1 @ sign of result
+ lsrs r2,#31
+ lsls r2,#31
+ mov r14,r2
+ lsls r0,#1
+ lsls r1,#1
+ lsrs r2,r0,#24 @ xe
+ beq fm_xe0
+ cmp r2,#255
+ beq fm_xe255
+fm_xe:
+ lsrs r3,r1,#24 @ ye
+ beq fm_ye0
+ cmp r3,#255
+ beq fm_ye255
+fm_ye:
+ adds r7,r2,r3 @ exponent of result (will possibly be incremented)
+ subs r7,#128 @ adjust bias for packing
+ lsls r0,#8 @ x mantissa
+ lsls r1,#8 @ y mantissa
+ lsrs r0,#9
+ lsrs r1,#9
+
+ adds r2,r0,r1 @ for later
+ mov r12,r2
+ lsrs r2,r0,#7 @ x[22..7] Q16
+ lsrs r3,r1,#7 @ y[22..7] Q16
+ muls r2,r2,r3 @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
+ muls r0,r0,r1 @ result [31..0] Q46
+ lsrs r2,#18 @ result [45..32] Q14
+ bcc 1f
+ cmp r0,#0
+ bmi 1f
+ adds r2,#1 @ fix error in r2
+1:
+ lsls r3,r0,#9 @ bits off bottom of result
+ lsrs r0,#23 @ Q23
+ lsls r2,#9
+ adds r0,r2 @ cut'n'shut
+ add r0,r12 @ implied 1*(x+y) to compensate for no insertion of implied 1s
+@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
+
+ lsrs r1,r0,#23
+ bne fm_0 @ branch if we need to shift down one place
+@ here 1<=result<2
+ cmp r7,#254
+ bhs fm_3a @ catches both underflow and overflow
+ lsls r3,#1 @ sticky bits at top of R3, rounding bit in carry
+ bcc fm_1 @ no rounding
+ beq fm_2 @ rounding tie?
+ adds r0,#1 @ round up
+fm_1:
+ adds r7,#1 @ for implied 1
+ lsls r7,#23 @ pack result
+ add r0,r7
+ add r0,r14
+ pop {r7,r15}
+fm_2: @ rounding tie
+ adds r0,#1
+fm_3:
+ lsrs r0,#1
+ lsls r0,#1 @ clear bottom bit
+ b fm_1
+
+@ here 1<=result-1<3
+fm_0:
+ adds r7,#1 @ increment exponent
+ cmp r7,#254
+ bhs fm_3b @ catches both underflow and overflow
+ lsrs r0,#1 @ shift mantissa down
+ bcc fm_1a @ no rounding
+ adds r0,#1 @ assume we will round up
+ cmp r3,#0 @ sticky bits
+ beq fm_3c @ rounding tie?
+fm_1a:
+ adds r7,r7
+ adds r7,#1 @ for implied 1
+ lsls r7,#22 @ pack result
+ add r0,r7
+ add r0,r14
+ pop {r7,r15}
+
+fm_3c:
+ lsrs r0,#1
+ lsls r0,#1 @ clear bottom bit
+ b fm_1a
+
+fm_xe0:
+ subs r2,#16
+fm_xe255:
+ lsls r2,#8
+ b fm_xe
+fm_ye0:
+ subs r3,#16
+fm_ye255:
+ lsls r3,#8
+ b fm_ye
+
+@ here the result is under- or overflowing
+fm_3b:
+ bge fm_4 @ branch on overflow
+@ trap case where result is denormal 0x007fffff + 0.5ulp or more
+ adds r7,#1 @ exponent=-1?
+ bne fm_5
+@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
+@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
+ adds r0,#2
+ lsrs r0,#23
+ cmp r0,#3
+ bne fm_5
+ b fm_6
+
+fm_3a:
+ bge fm_4 @ branch on overflow
+@ trap case where result is denormal 0x007fffff + 0.5ulp or more
+ adds r7,#1 @ exponent=-1?
+ bne fm_5
+ adds r0,#1 @ mantissa=0xffffff (i.e., r0=0x7fffff)?
+ lsrs r0,#23
+ beq fm_5
+fm_6:
+ movs r0,#1 @ return smallest normal
+ lsls r0,#23
+ add r0,r14
+ pop {r7,r15}
+
+fm_5:
+ mov r0,r14
+ pop {r7,r15}
+fm_4:
+ movs r0,#0xff
+ lsls r0,#23
+ add r0,r14
+ pop {r7,r15}
+
+@ This version of the division algorithm uses a small lookup table (rcpapp, below) to
+@ estimate the reciprocal of the divisor to about 14 bits; then a multiplication step
+@ to get a first quotient estimate; then the remainder based on this estimate is used
+@ to calculate a correction to the quotient. The result is good to about 27 bits and
+@ so we only need to calculate the exact remainder when close to a rounding boundary.
+.align 2
+.thumb_func
+qfp_fdiv:
+ push {r4,r5,r6,r14}
+fdiv_n:
+
+ movs r4,#1
+ lsls r4,#23 @ implied 1 position
+ lsls r2,r1,#9 @ clear out sign and exponent
+ lsrs r2,r2,#9
+ orrs r2,r2,r4 @ divisor mantissa Q23 with implied 1
+
+@ here
+@ r0=packed dividend
+@ r1=packed divisor
+@ r2=divisor mantissa Q23
+@ r4=1<<23
+
+// see divtest.c
+ lsrs r3,r2,#18 @ x2=x>>18; // Q5 32..63
+ adr r5,rcpapp-32
+ ldrb r3,[r5,r3] @ u=lut5[x2-32]; // Q8
+ lsls r5,r2,#5
+ muls r5,r5,r3
+ asrs r5,#14 @ e=(i32)(u*(x<<5))>>14; // Q22
+ asrs r6,r5,#11
+ muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
+ subs r5,r6
+ muls r5,r5,r3 @ c=(e-e2)*u; // Q30
+ lsls r6,r3,#8
+ asrs r5,#13
+ adds r5,#1
+ asrs r5,#1
+ subs r5,r6,r5 @ u0=(u<<8)-((c+0x2000)>>14); // Q16
+
+@ here
+@ r0=packed dividend
+@ r1=packed divisor
+@ r2=divisor mantissa Q23
+@ r4=1<<23
+@ r5=reciprocal estimate Q16
+
+ lsrs r6,r0,#23
+ uxtb r3,r6 @ dividend exponent
+ lsls r0,#9
+ lsrs r0,#9
+ orrs r0,r0,r4 @ dividend mantissa Q23
+
+ lsrs r1,#23
+ eors r6,r1 @ sign of result in bit 8
+ lsrs r6,#8
+ lsls r6,#31 @ sign of result in bit 31, other bits clear
+
+@ here
+@ r0=dividend mantissa Q23
+@ r1=divisor sign+exponent
+@ r2=divisor mantissa Q23
+@ r3=dividend exponent
+@ r5=reciprocal estimate Q16
+@ r6b31=sign of result
+
+ uxtb r1,r1 @ divisor exponent
+ cmp r1,#0
+ beq retinf
+ cmp r1,#255
+ beq 20f @ divisor is infinite
+ cmp r3,#0
+ beq retzero
+ cmp r3,#255
+ beq retinf
+ subs r3,r1 @ initial result exponent (no bias)
+ adds r3,#125 @ add bias
+
+ lsrs r1,r0,#8 @ dividend mantissa Q15
+
+@ here
+@ r0=dividend mantissa Q23
+@ r1=dividend mantissa Q15
+@ r2=divisor mantissa Q23
+@ r3=initial result exponent
+@ r5=reciprocal estimate Q16
+@ r6b31=sign of result
+
+ muls r1,r5
+
+ lsrs r1,#16 @ Q15 qu0=(q15)(u*y0);
+ lsls r0,r0,#15 @ dividend Q38
+ movs r4,r2
+ muls r4,r1 @ Q38 qu0*x
+ subs r4,r0,r4 @ Q38 re0=(y<<15)-qu0*x; note this remainder is signed
+ asrs r4,#10
+ muls r4,r5 @ Q44 qu1=(re0>>10)*u; this quotient correction is also signed
+ asrs r4,#16 @ Q28
+ lsls r1,#13
+ adds r1,r1,r4 @ Q28 qu=(qu0<<13)+(qu1>>16);
+
+@ here
+@ r0=dividend mantissa Q38
+@ r1=quotient Q28
+@ r2=divisor mantissa Q23
+@ r3=initial result exponent
+@ r6b31=sign of result
+
+ lsrs r4,r1,#28
+ bne 1f
+@ here the quotient is less than 1<<28 (i.e., result mantissa <1.0)
+
+ adds r1,#5
+ lsrs r4,r1,#4 @ rounding + small reduction in systematic bias
+ bcc 2f @ skip if we are not near a rounding boundary
+ lsrs r1,#3 @ quotient Q25
+ lsls r0,#10 @ dividend mantissa Q48
+ muls r1,r1,r2 @ quotient*divisor Q48
+ subs r0,r0,r1 @ remainder Q48
+ bmi 2f
+ b 3f
+
+1:
+@ here the quotient is at least 1<<28 (i.e., result mantissa >=1.0)
+
+ adds r3,#1 @ bump exponent (and shift mantissa down one more place)
+ adds r1,#9
+ lsrs r4,r1,#5 @ rounding + small reduction in systematic bias
+ bcc 2f @ skip if we are not near a rounding boundary
+
+ lsrs r1,#4 @ quotient Q24
@ quotient Q24
+ lsls r0,#9 @ dividend mantissa Q47
+ muls r1,r1,r2 @ quotient*divisor Q47
+ subs r0,r0,r1 @ remainder Q47
+ bmi 2f
+3:
+ adds r4,#1 @ increment quotient as we are above the rounding boundary
+
+@ here
+@ r3=result exponent
+@ r4=correctly rounded quotient Q23 in range [1,2] *note closed interval*
+@ r6b31=sign of result
+
+2:
+ cmp r3,#254
+ bhs 10f @ this catches both underflow and overflow
+ lsls r1,r3,#23
+ adds r0,r4,r1
+ adds r0,r6
+ pop {r4,r5,r6,r15}
+
+@ here divisor is infinite; dividend exponent in r3
+20:
+ cmp r3,#255
+ bne retzero
+
+retinf:
+ movs r0,#255
+21:
+ lsls r0,#23
+ orrs r0,r6
+ pop {r4,r5,r6,r15}
+
+10:
+ bge retinf @ overflow?
+ adds r1,r3,#1
+ bne retzero @ exponent <-1? return 0
+@ here exponent is exactly -1
+ lsrs r1,r4,#25
+ bcc retzero @ mantissa is not 01000000?
+@ return minimum normal
+ movs r0,#1
+ lsls r0,#23
+ orrs r0,r6
+ pop {r4,r5,r6,r15}
+
+retzero:
+ movs r0,r6
+ pop {r4,r5,r6,r15}
+
+@ x2=[32:1:63]/32;
+@ round(256 ./(x2+1/64))
+.align 2
+rcpapp:
+.byte 252,245,237,231,224,218,213,207,202,197,193,188,184,180,176,172
+.byte 169,165,162,159,156,153,150,148,145,142,140,138,135,133,131,129
+
+@ The square root routine uses an initial approximation to the reciprocal of the square root of the argument based
+@ on the top four bits of the mantissa (possibly shifted one place to make the exponent even). It then performs two
+@ Newton-Raphson iterations, resulting in about 14 bits of accuracy. This reciprocal is then multiplied by
+@ the original argument to produce an approximation to the result, again with about 14 bits of accuracy.
+@ Then a remainder is calculated, and multiplied by the reciprocal estimate to generate a correction term
+@ giving a final answer to about 28 bits of accuracy. A final remainder calculation rounds to the correct
+@ result if necessary.
+@ Again, the fixed-point calculation is carefully implemented to preserve accuracy, and similar comments to those
+@ made above on the fast division routine apply.
+@ The reciprocal square root calculation has been tested for all possible (possibly shifted) input mantissa values.
+.align 2
+.thumb_func
+qfp_fsqrt:
+ push {r4}
+ lsls r1,r0,#1
+ bcs sq_0 @ negative?
+ lsls r1,#8
+ lsrs r1,#9 @ mantissa
+ movs r2,#1
+ lsls r2,#23
+ adds r1,r2 @ insert implied 1
+ lsrs r2,r0,#23 @ extract exponent
+ beq sq_2 @ zero?
+ cmp r2,#255 @ infinite?
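+
+@ (The beq below routes an infinite argument to sq_1; zero and negative
+@ arguments were dispatched to sq_2/sq_0 above.)  The following C model of
+@ the refinement described in the comment before qfp_fsqrt is illustrative
+@ only and not part of qfplib: it uses float arithmetic in place of the
+@ exact Q formats of the assembly, and rsqrt_table is a hypothetical
+@ stand-in for the rsqrtapp lookup.
+/*
+extern float rsqrt_table(float y);        // stand-in for the rsqrtapp table
+
+float fsqrt_model(float y)                // y in [1,4) after unpacking
+{
+    float a = rsqrt_table(y);             // ~4-bit guess at 1/sqrt(y)
+    a = a * (1.5f - 0.5f * y * a * a);    // Newton-Raphson step 1
+    a = a * (1.5f - 0.5f * y * a * a);    // Newton-Raphson step 2: ~14 bits
+    float s = a * y;                      // ~14-bit estimate of sqrt(y)
+    float r = y - s * s;                  // remainder
+    return s + 0.5f * a * r;              // corrected result, ~28 bits
+}
+*/
+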
+ beq sq_1 + adds r2,#125 @ correction for packing + asrs r2,#1 @ exponent/2, LSB into carry + bcc 1f + lsls r1,#1 @ was even: double mantissa; mantissa y now 1..4 Q23 +1: + adr r4,rsqrtapp-4@ first four table entries are never accessed because of the mantissa's leading 1 + lsrs r3,r1,#21 @ y Q2 + ldrb r4,[r4,r3] @ initial approximation to reciprocal square root a0 Q8 + + lsrs r0,r1,#7 @ y Q16: first Newton-Raphson iteration + muls r0,r4 @ a0*y Q24 + muls r0,r4 @ r0=p0=a0*y*y Q32 + asrs r0,#12 @ r0 Q20 + muls r0,r4 @ dy0=a0*r0 Q28 + asrs r0,#13 @ dy0 Q15 + lsls r4,#8 @ a0 Q16 + subs r4,r0 @ a1=a0-dy0/2 Q16-Q15/2 -> Q16 + adds r4,#170 @ mostly remove systematic error in this approximation: gains approximately 1 bit + + movs r0,r4 @ second Newton-Raphson iteration + muls r0,r0 @ a1*a1 Q32 + lsrs r0,#15 @ a1*a1 Q17 + lsrs r3,r1,#8 @ y Q15 + muls r0,r3 @ r1=p1=a1*a1*y Q32 + asrs r0,#12 @ r1 Q20 + muls r0,r4 @ dy1=a1*r1 Q36 + asrs r0,#21 @ dy1 Q15 + subs r4,r0 @ a2=a1-dy1/2 Q16-Q15/2 -> Q16 + + muls r3,r4 @ a3=y*a2 Q31 + lsrs r3,#15 @ a3 Q16 +@ here a2 is an approximation to the reciprocal square root +@ and a3 is an approximation to the square root + movs r0,r3 + muls r0,r0 @ a3*a3 Q32 + lsls r1,#9 @ y Q32 + subs r0,r1,r0 @ r2=y-a3*a3 Q32 remainder + asrs r0,#5 @ r2 Q27 + muls r4,r0 @ r2*a2 Q43 + lsls r3,#7 @ a3 Q23 + asrs r0,r4,#15 @ r2*a2 Q28 + adds r0,#16 @ rounding to Q24 + asrs r0,r0,#6 @ r2*a2 Q22 + add r3,r0 @ a4 Q23: candidate final result + bcc sq_3 @ near rounding boundary? skip if no rounding needed + mov r4,r3 + adcs r4,r4 @ a4+0.5ulp Q24 + muls r4,r4 @ Q48 + lsls r1,#16 @ y Q48 + subs r1,r4 @ remainder Q48 + bmi sq_3 + adds r3,#1 @ round up +sq_3: + lsls r2,#23 @ pack exponent + adds r0,r2,r3 +sq_6: + pop {r4} + bx r14 + +sq_0: + lsrs r1,#24 + beq sq_2 @ -0: return it +@ here negative and not -0: return -Inf + asrs r0,#31 +sq_5: + lsls r0,#23 + b sq_6 +sq_1: @ +Inf + lsrs r0,#23 + b sq_5 +sq_2: + lsrs r0,#31 + lsls r0,#31 + b sq_6 + +@ round(sqrt(2^22./[72:16:248])) +rsqrtapp: +.byte 0xf1,0xda,0xc9,0xbb, 0xb0,0xa6,0x9e,0x97, 0x91,0x8b,0x86,0x82 + + + +@ Notation: +@ rx:ry means the concatenation of rx and ry with rx having the less significant bits + +@ IEEE double in ra:rb -> +@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set +@ exponent in re +@ sign in rs +@ trashes rt +.macro mdunpack ra,rb,re,rs,rt + lsrs \re,\rb,#20 @ extract sign and exponent + subs \rs,\re,#1 + lsls \rs,#20 + subs \rb,\rs @ clear sign and exponent in mantissa; insert implied 1 + lsrs \rs,\re,#11 @ sign + lsls \re,#21 + lsrs \re,#21 @ exponent + beq l\@_1 @ zero exponent? + adds \rt,\re,#1 + lsrs \rt,#11 + beq l\@_2 @ exponent != 0x7ff? then done +l\@_1: + movs \ra,#0 + movs \rb,#1 + lsls \rb,#20 + subs \re,#128 + lsls \re,#12 +l\@_2: +.endm + +@ IEEE double in ra:rb -> +@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 +@ exponent in re +@ trashes rt0 and rt1 +@ +zero, +denormal -> exponent=-0x80000 +@ -zero, -denormal -> exponent=-0x80000 +@ +Inf, +NaN -> exponent=+0x77f000 +@ -Inf, -NaN -> exponent=+0x77e000 +.macro mdunpacks ra,rb,re,rt0,rt1 + lsrs \re,\rb,#20 @ extract sign and exponent + lsrs \rt1,\rb,#31 @ sign only + subs \rt0,\re,#1 + lsls \rt0,#20 + subs \rb,\rt0 @ clear sign and exponent in mantissa; insert implied 1 + lsls \re,#21 + bcc l\@_1 @ skip on positive + mvns \rb,\rb @ negate mantissa + rsbs \ra,#0 + bcc l\@_1 + adds \rb,#1 +l\@_1: + lsrs \re,#21 + beq l\@_2 @ zero exponent? + adds \rt0,\re,#1 + lsrs \rt0,#11 + beq l\@_3 @ exponent != 0x7ff? 
then done + subs \re,\rt1 +l\@_2: + movs \ra,#0 + lsls \rt1,#1 @ +ve: 0 -ve: 2 + adds \rb,\rt1,#1 @ +ve: 1 -ve: 3 + lsls \rb,#30 @ create +/-1 mantissa + asrs \rb,#10 + subs \re,#128 + lsls \re,#12 +l\@_3: +.endm + +.align 2 +.thumb_func +qfp_dsub: + push {r4-r7,r14} + movs r4,#1 + lsls r4,#31 + eors r3,r4 @ flip sign on second argument + b da_entry @ continue in dadd + +.align 2 +.thumb_func +qfp_dadd: + push {r4-r7,r14} +da_entry: + mdunpacks r0,r1,r4,r6,r7 + mdunpacks r2,r3,r5,r6,r7 + subs r7,r5,r4 @ ye-xe + subs r6,r4,r5 @ xe-ye + bmi da_ygtx +@ here xe>=ye: need to shift y down r6 places + mov r12,r4 @ save exponent + cmp r6,#32 + bge da_xrgty @ xe rather greater than ye? + adds r7,#32 + movs r4,r2 + lsls r4,r4,r7 @ rounding bit + sticky bits +da_xgty0: + movs r5,r3 + lsls r5,r5,r7 + lsrs r2,r6 + asrs r3,r6 + orrs r2,r5 +da_add: + adds r0,r2 + adcs r1,r3 +da_pack: +@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4 +@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we +@ cannot have not gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4 +@ to provide a correct continuation of the exact result. +@ now pack result back up + lsrs r3,r1,#31 @ get sign bit + beq 1f @ skip on positive + mvns r1,r1 @ negate mantissa + mvns r0,r0 + movs r2,#0 + rsbs r4,#0 + adcs r0,r2 + adcs r1,r2 +1: + mov r2,r12 @ get exponent + lsrs r5,r1,#21 + bne da_0 @ shift down required? + lsrs r5,r1,#20 + bne da_1 @ normalised? + cmp r0,#0 + beq da_5 @ could mantissa be zero? +da_2: + adds r4,r4 + adcs r0,r0 + adcs r1,r1 + subs r2,#1 @ adjust exponent + lsrs r5,r1,#20 + beq da_2 +da_1: + lsls r4,#1 @ check rounding bit + bcc da_3 +da_4: + adds r0,#1 @ round up + bcc 2f + adds r1,#1 +2: + cmp r4,#0 @ sticky bits zero? + bne da_3 + lsrs r0,#1 @ round to even + lsls r0,#1 +da_3: + subs r2,#1 + bmi da_6 + adds r4,r2,#2 @ check if exponent is overflowing + lsrs r4,#11 + bne da_7 + lsls r2,#20 @ pack exponent and sign + add r1,r2 + lsls r3,#31 + add r1,r3 + pop {r4-r7,r15} + +da_7: +@ here exponent overflow: return signed infinity + lsls r1,r3,#31 + ldr r3,=#0x7ff00000 + orrs r1,r3 + b 1f +da_6: +@ here exponent underflow: return signed zero + lsls r1,r3,#31 +1: + movs r0,#0 + pop {r4-r7,r15} + +da_5: +@ here mantissa could be zero + cmp r1,#0 + bne da_2 + cmp r4,#0 + bne da_2 +@ inputs must have been of identical magnitude and opposite sign, so return +0 + pop {r4-r7,r15} + +da_0: +@ here a shift down by one place is required for normalisation + adds r2,#1 @ adjust exponent + lsls r6,r0,#31 @ save rounding bit + lsrs r0,#1 + lsls r5,r1,#31 + orrs r0,r5 + lsrs r1,#1 + cmp r6,#0 + beq da_3 + b da_4 + +da_xrgty: @ xe>ye and shift>=32 places + cmp r6,#60 + bge da_xmgty @ xe much greater than ye? + subs r6,#32 + adds r7,#64 + + movs r4,r2 + lsls r4,r4,r7 @ these would be shifted off the bottom of the sticky bits + beq 1f + movs r4,#1 +1: + lsrs r2,r2,r6 + orrs r4,r2 + movs r2,r3 + lsls r3,r3,r7 + orrs r4,r3 + asrs r3,r2,#31 @ propagate sign bit + b da_xgty0 + +da_ygtx: +@ here ye>xe: need to shift x down r7 places + mov r12,r5 @ save exponent + cmp r7,#32 + bge da_yrgtx @ ye rather greater than xe? + adds r6,#32 + movs r4,r0 + lsls r4,r4,r6 @ rounding bit + sticky bits +da_ygtx0: + movs r5,r1 + lsls r5,r5,r6 + lsrs r0,r7 + asrs r1,r7 + orrs r0,r5 + b da_add + +da_yrgtx: + cmp r7,#60 + bge da_ymgtx @ ye much greater than xe? 
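+
+@ As in da_xrgty above, x must be shifted down 32..59 places to align the
+@ binary points; every bit shifted out is OR-ed into the sticky word in r4 so
+@ that the rounding at da_1/da_4/da_3 can still detect a non-zero tail.
+@ Illustrative C model of that rounding (not part of qfplib): m is the
+@ normalised mantissa, rs holds the rounding bit in its MSB with the sticky
+@ bits below, as in r4.
+/*
+#include <stdint.h>
+
+static uint64_t round_nearest_even(uint64_t m, uint32_t rs)
+{
+    if (rs & 0x80000000u) {       // rounding bit set: round up...
+        m += 1;
+        if ((rs << 1) == 0)       // ...unless exactly halfway between values:
+            m &= ~(uint64_t)1;    // then force the LSB to 0 (round to even)
+    }
+    return m;
+}
+*/
+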
+ subs r7,#32 + adds r6,#64 + + movs r4,r0 + lsls r4,r4,r6 @ these would be shifted off the bottom of the sticky bits + beq 1f + movs r4,#1 +1: + lsrs r0,r0,r7 + orrs r4,r0 + movs r0,r1 + lsls r1,r1,r6 + orrs r4,r1 + asrs r1,r0,#31 @ propagate sign bit + b da_ygtx0 + +da_ymgtx: @ result is just y + movs r0,r2 + movs r1,r3 +da_xmgty: @ result is just x + movs r4,#0 @ clear sticky bits + b da_pack + +.ltorg + +@ equivalent of UMULL +@ needs five temporary registers +@ can have rt3==rx, in which case rx trashed +@ can have rt4==ry, in which case ry trashed +@ can have rzl==rx +@ can have rzh==ry +@ can have rzl,rzh==rt3,rt4 +.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 + @ t0 t1 t2 t3 t4 + @ (x) (y) + uxth \rt0,\rx @ xl + uxth \rt1,\ry @ yl + muls \rt0,\rt1 @ xlyl=L + lsrs \rt2,\rx,#16 @ xh + muls \rt1,\rt2 @ xhyl=M0 + lsrs \rt4,\ry,#16 @ yh + muls \rt2,\rt4 @ xhyh=H + uxth \rt3,\rx @ xl + muls \rt3,\rt4 @ xlyh=M1 + adds \rt1,\rt3 @ M0+M1=M + bcc l\@_1 @ addition of the two cross terms can overflow, so add carry into H + movs \rt3,#1 @ 1 + lsls \rt3,#16 @ 0x10000 + adds \rt2,\rt3 @ H' +l\@_1: + @ t0 t1 t2 t3 t4 + @ (zl) (zh) + lsls \rzl,\rt1,#16 @ ML + lsrs \rzh,\rt1,#16 @ MH + adds \rzl,\rt0 @ ZL + adcs \rzh,\rt2 @ ZH +.endm + +@ SUMULL: x signed, y unsigned +@ in table below ¯ means signed variable +@ needs five temporary registers +@ can have rt3==rx, in which case rx trashed +@ can have rt4==ry, in which case ry trashed +@ can have rzl==rx +@ can have rzh==ry +@ can have rzl,rzh==rt3,rt4 +.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 + @ t0 t1 t2 t3 t4 + @ ¯(x) (y) + uxth \rt0,\rx @ xl + uxth \rt1,\ry @ yl + muls \rt0,\rt1 @ xlyl=L + asrs \rt2,\rx,#16 @ ¯xh + muls \rt1,\rt2 @ ¯xhyl=M0 + lsrs \rt4,\ry,#16 @ yh + muls \rt2,\rt4 @ ¯xhyh=H + uxth \rt3,\rx @ xl + muls \rt3,\rt4 @ xlyh=M1 + asrs \rt4,\rt1,#31 @ M0sx (M1 sign extension is zero) + adds \rt1,\rt3 @ M0+M1=M + movs \rt3,#0 @ 0 + adcs \rt4,\rt3 @ ¯Msx + lsls \rt4,#16 @ ¯Msx<<16 + adds \rt2,\rt4 @ H' + + @ t0 t1 t2 t3 t4 + @ (zl) (zh) + lsls \rzl,\rt1,#16 @ M~ + lsrs \rzh,\rt1,#16 @ M~ + adds \rzl,\rt0 @ ZL + adcs \rzh,\rt2 @ ¯ZH +.endm + +@ SSMULL: x signed, y signed +@ in table below ¯ means signed variable +@ needs five temporary registers +@ can have rt3==rx, in which case rx trashed +@ can have rt4==ry, in which case ry trashed +@ can have rzl==rx +@ can have rzh==ry +@ can have rzl,rzh==rt3,rt4 +.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 + @ t0 t1 t2 t3 t4 + @ ¯(x) (y) + uxth \rt0,\rx @ xl + uxth \rt1,\ry @ yl + muls \rt0,\rt1 @ xlyl=L + asrs \rt2,\rx,#16 @ ¯xh + muls \rt1,\rt2 @ ¯xhyl=M0 + asrs \rt4,\ry,#16 @ ¯yh + muls \rt2,\rt4 @ ¯xhyh=H + uxth \rt3,\rx @ xl + muls \rt3,\rt4 @ ¯xlyh=M1 + adds \rt1,\rt3 @ ¯M0+M1=M + asrs \rt3,\rt1,#31 @ Msx + bvc l\@_1 @ + mvns \rt3,\rt3 @ ¯Msx flip sign extension bits if overflow +l\@_1: + lsls \rt3,#16 @ ¯Msx<<16 + adds \rt2,\rt3 @ H' + + @ t0 t1 t2 t3 t4 + @ (zl) (zh) + lsls \rzl,\rt1,#16 @ M~ + lsrs \rzh,\rt1,#16 @ M~ + adds \rzl,\rt0 @ ZL + adcs \rzh,\rt2 @ ¯ZH +.endm + +@ can have rt2==rx, in which case rx trashed +@ can have rzl==rx +@ can have rzh==rt1 +.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2 + @ t0 t1 t2 zl zh + uxth \rt0,\rx @ xl + muls \rt0,\rt0 @ xlxl=L + uxth \rt1,\rx @ xl + lsrs \rt2,\rx,#16 @ xh + muls \rt1,\rt2 @ xlxh=M + muls \rt2,\rt2 @ xhxh=H + lsls \rzl,\rt1,#17 @ ML + lsrs \rzh,\rt1,#15 @ MH + adds \rzl,\rt0 @ ZL + adcs \rzh,\rt2 @ ZH +.endm + +.align 2 +.thumb_func +qfp_dmul: + push {r4-r7,r14} + mdunpack r0,r1,r4,r6,r5 + mov r12,r4 + 
mdunpack r2,r3,r4,r7,r5 + eors r7,r6 @ sign of result + add r4,r12 @ exponent of result + push {r0-r2,r4,r7} + +@ accumulate full product in r12:r5:r6:r7 + mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5 @ XL*YL + mov r12,r0 @ save LL bits + + mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7 @ XH*YH + + pop {r0} @ XL + mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3 @ XL*YH + adds r5,r0 + adcs r6,r3 + movs r0,#0 + adcs r7,r0 + + pop {r1,r2} @ XH,YL + mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2 @ XH*YL + adds r5,r1 + adcs r6,r2 + movs r0,#0 + adcs r7,r0 + +@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12 + pop {r3,r4} @ exponent in r3, sign in r4 + lsls r1,r7,#11 + lsrs r2,r6,#21 + orrs r1,r2 + lsls r0,r6,#11 + lsrs r2,r5,#21 + orrs r0,r2 + lsls r5,#11 @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12 + lsrs r2,r1,#20 + bne 1f @ skip if in range [2..4) + adds r5,r5 @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32) + adcs r0,r0 + adcs r1,r1 + subs r3,#1 @ correct exponent +1: + ldr r6,=#0x3ff + subs r3,r6 @ correct for exponent bias + lsls r6,#1 @ 0x7fe + cmp r3,r6 + bhs dm_0 @ exponent over- or underflow + lsls r5,#1 @ rounding bit to carry + bcc 1f @ result is correctly rounded + adds r0,#1 + movs r6,#0 + adcs r1,r6 @ round up + mov r6,r12 @ remaining sticky bits + orrs r5,r6 + bne 1f @ some sticky bits set? + lsrs r0,#1 + lsls r0,#1 @ round to even +1: + lsls r3,#20 + adds r1,r3 +dm_2: + lsls r4,#31 + add r1,r4 + pop {r4-r7,r15} + +@ here for exponent over- or underflow +dm_0: + bge dm_1 @ overflow? + adds r3,#1 @ would-be zero exponent? + bne 1f + adds r0,#1 + bne 1f @ all-ones mantissa? + adds r1,#1 + lsrs r7,r1,#21 + beq 1f + lsrs r1,#1 + b dm_2 +1: + lsls r1,r4,#31 + movs r0,#0 + pop {r4-r7,r15} + +@ here for exponent overflow +dm_1: + adds r6,#1 @ 0x7ff + lsls r1,r6,#20 + movs r0,#0 + b dm_2 + +.ltorg + +@ Approach to division y/x is as follows. +@ +@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top +@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so). +@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction +@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact +@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to +@ round up or down. +@ +@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively +@ that | u1*x-1 | < 10*2^-32. +@ +@ More precisely: +@ +@ x0=(q16)x; +@ x1=(q30)x; +@ y0=(q31)y; +@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation +@ v=(q30)(u0*x1-1); +@ u1=(q30)u0-(q30~)(u0*v); +@ +@ a0=(q30)(u1*y0); +@ r0=(q82)y-a0*x; +@ r0x=(q57)r0; +@ d0=r0x*u1; +@ a1=d0+a0; +@ +@ Error analysis +@ +@ Use Greek letters to represent the errors introduced by rounding and truncation. +@ +@ r₀ = y - a₀x +@ = y - [ u₁ ( y - α ) - β ] x where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30 +@ = y ( 1 - u₁x ) + ( u₁α + β ) x +@ +@ Hence +@ +@ | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30 +@ = 26*2^-32 +@ +@ r₁ = y - a₁x +@ = y - a₀x - d₀x +@ = r₀ - d₀x +@ = r₀ - u₁ ( r₀ - γ ) x where 0 ≤ γ < 2^-57 +@ = r₀ ( 1 - u₁x ) + u₁γx +@ +@ Hence +@ +@ | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57 +@ = (260+128)*2^-64 +@ < 2^-55 +@ +@ Empirically it seems to be nearly twice as good as this. +@ +@ To determine correctly whether the exact remainder calculation can be skipped we need a result +@ accurate to < 0.25ulp. 
In the case where x>y the quotient will be shifted up one place for normalisation +@ and so 1ulp is 2^-53 and so the calculation above suffices. + +.align 2 +.thumb_func +qfp_ddiv: + push {r4-r7,r14} +ddiv0: @ entry point from dtan + mdunpack r2,r3,r4,r7,r6 @ unpack divisor + +@ unpack dividend by hand to save on register use + lsrs r6,r1,#31 + adds r6,r7 + mov r12,r6 @ result sign in r12b0; r12b1 trashed + lsls r1,#1 + lsrs r7,r1,#21 @ exponent + beq 1f @ zero exponent? + adds r6,r7,#1 + lsrs r6,#11 + beq 2f @ exponent != 0x7ff? then done +1: + movs r0,#0 + movs r1,#0 + subs r7,#64 @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct + lsls r7,#12 +2: + subs r6,r7,r4 + lsls r6,#2 + add r12,r12,r6 @ (signed) exponent in r12[31..8] + subs r7,#1 @ implied 1 + lsls r7,#21 + subs r1,r7 + lsrs r1,#1 + +// see dreciptest-boxc.c + lsrs r4,r3,#15 @ x2=x>>15; // Q5 32..63 + ldr r5,=#(rcpapp-32) + ldrb r4,[r5,r4] @ u=lut5[x2-32]; // Q8 + lsls r5,r3,#8 + muls r5,r5,r4 + asrs r5,#14 @ e=(i32)(u*(x<<8))>>14; // Q22 + asrs r6,r5,#11 + muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22 + subs r5,r6 + muls r5,r5,r4 @ c=(e-e2)*u; // Q30 + lsls r6,r4,#7 + asrs r5,#14 + adds r5,#1 + asrs r5,#1 + subs r6,r5 @ u0=(u<<7)-((c+0x4000)>>15); // Q15 + +@ here +@ r0:r1 y mantissa +@ r2:r3 x mantissa +@ r6 u0, first approximation to 1/x Q15 +@ r12: result sign, exponent + + lsls r4,r3,#10 + lsrs r5,r2,#22 + orrs r5,r4 @ x1=(q30)x + muls r5,r6 @ u0*x1 Q45 + asrs r5,#15 @ v=u0*x1-1 Q30 + muls r5,r6 @ u0*v Q45 + asrs r5,#14 + adds r5,#1 + asrs r5,#1 @ round u0*v to Q30 + lsls r6,#15 + subs r6,r5 @ u1 Q30 + +@ here +@ r0:r1 y mantissa +@ r2:r3 x mantissa +@ r6 u1, second approximation to 1/x Q30 +@ r12: result sign, exponent + + push {r2,r3} + lsls r4,r1,#11 + lsrs r5,r0,#21 + orrs r4,r5 @ y0=(q31)y + mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5 @ y0*u1 Q61 + adds r4,r4 + adcs r5,r5 @ a0=(q30)(y0*u1) + +@ here +@ r0:r1 y mantissa +@ r5 a0, first approximation to y/x Q30 +@ r6 u1, second approximation to 1/x Q30 +@ r12 result sign, exponent + + ldr r2,[r13,#0] @ xL + mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3 @ xL*a0 + ldr r4,[r13,#4] @ xH + muls r4,r5 @ xH*a0 + adds r3,r4 @ r2:r3 now x*a0 Q82 + lsrs r2,#25 + lsls r1,r3,#7 + orrs r2,r1 @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89 + lsls r4,r0,#5 @ y Q57 + subs r0,r4,r2 @ r0x=y-x*a0 Q57 (signed) + +@ here +@ r0 r0x Q57 +@ r5 a0, first approximation to y/x Q30 +@ r4 yL Q57 +@ r6 u1 Q30 +@ r12 result sign, exponent + + muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6 @ r7:r6 r0x*u1 Q87 + asrs r3,r6,#25 + adds r5,r3 + lsls r3,r6,#7 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point) +@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7 +@ but these bits are thrown away in the rounding and conversion to Q52 below + +@ here +@ r3:r5 a1 Q62 candidate quotient [0.5,2) or so +@ r4 yL Q57 +@ r12 result sign, exponent + + movs r6,#0 + adds r3,#128 @ for initial rounding to Q53 + adcs r5,r5,r6 + lsrs r1,r5,#30 + bne dd_0 +@ here candidate quotient a1 is in range [0.5,1) +@ so 30 significant bits in r5 + + lsls r4,#1 @ y now Q58 + lsrs r1,r5,#9 @ to Q52 + lsls r0,r5,#23 + lsrs r3,#9 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result + orrs r0,r3 + bcs dd_1 + b dd_2 +dd_0: +@ here candidate quotient a1 is in range [1,2) +@ so 31 significant bits in r5 + + movs r2,#4 + add r12,r12,r2 @ fix exponent; r3:r5 now effectively Q61 + adds r3,#128 @ complete rounding to Q53 + adcs r5,r5,r6 + lsrs r1,r5,#10 + 
lsls r0,r5,#22 + lsrs r3,#10 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result + orrs r0,r3 + bcc dd_2 +dd_1: + +@ here +@ r0:r1 rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest +@ r4 yL Q58 or Q57 +@ r12 result sign, exponent +@ carry set + + adcs r0,r0,r0 + adcs r1,r1,r1 @ z Q53 with 1 in LSB + lsls r4,#16 @ Q105-32=Q73 + ldr r2,[r13,#0] @ xL Q52 + ldr r3,[r13,#4] @ xH Q20 + + movs r5,r1 @ zH Q21 + muls r5,r2 @ zH*xL Q73 + subs r4,r5 + muls r3,r0 @ zL*xH Q73 + subs r4,r3 + mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3 @ xL*zL + rsbs r2,#0 @ borrow from low half? + sbcs r4,r3 @ y-xz Q73 (remainder bits 52..73) + + cmp r4,#0 + + bmi 1f + movs r2,#0 @ round up + adds r0,#1 + adcs r1,r2 +1: + lsrs r0,#1 @ shift back down to Q52 + lsls r2,r1,#31 + orrs r0,r2 + lsrs r1,#1 +dd_2: + add r13,#8 + mov r2,r12 + lsls r7,r2,#31 @ result sign + asrs r2,#2 @ result exponent + ldr r3,=#0x3fd + adds r2,r3 + ldr r3,=#0x7fe + cmp r2,r3 + bhs dd_3 @ over- or underflow? + lsls r2,#20 + adds r1,r2 @ pack exponent +dd_5: + adds r1,r7 @ pack sign + pop {r4-r7,r15} + +dd_3: + movs r0,#0 + cmp r2,#0 + bgt dd_4 @ overflow? + movs r1,r7 + pop {r4-r7,r15} + +dd_4: + adds r3,#1 @ 0x7ff + lsls r1,r3,#20 + b dd_5 + +/* +Approach to square root x=sqrt(y) is as follows. + +First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y +to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because +d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is +a better approximation to sqrt(y). If this is near a rounding boundary we +compute an exact remainder y-a5*a5 to decide whether to round up or down. + +The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively +that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32. + +More precisely, with "y" representing y truncated to 30 binary places: + +u=(q3)y; // 24-entry table +a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u); // first approximation from table +p0=(q16)(a0*a0) * (q16)y; +r0=(q20)(p0-1); +dy0=(q15)(r0*a0); // Newton-Raphson correction term +a1=(q16)a0-dy0/2; // good to ~9 bits + +p1=(q19)(a1*a1)*(q19)y; +r1=(q23)(p1-1); +dy1=(q15~)(r1*a1); // second Newton-Raphson correction +a2x=(q16)a1-dy1/2; // good to ~16 bits +a2=a2x-a2x/1t16; // prevent overflow of a2*a2 in 32 bits + +p2=(a2*a2)*(q30)y; // Q62 +r2=(q36)(p2-1+1t-31); +dy2=(q30)(r2*a2); // Q52->Q30 +a3=(q31)a2-dy2/2; // good to about 30 bits +a4=(q30)(a3*(q30)y+1t-31); // good to about 28 bits + +Error analysis + + r₄ = y - a₄² + d₄ = 1/2 a₃r₄ + a₅ = a₄ + d₄ + r₅ = y - a₅² + = y - ( a₄ + d₄ )² + = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄² + = r₄ - a₃a₄r₄ - 1/4 a₃²r₄² + + | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄² + + a₅ = √y √( 1 - r₅/y ) + = √y ( 1 - 1/2 r₅/y + ... ) + +So to first order (second order being very tiny) + + √y - a₅ = 1/2 r₅/y + +and + + | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y ) + +From dsqrttest.c (conservatively): + + < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 ) + = 1/2 ( 200 + 200 ) * 2^-64 + < 2^-56 + +Empirically we see about 1ulp worst-case error including rounding at Q57. + +To determine correctly whether the exact remainder calculation can be skipped we need a result +accurate to < 0.25ulp at Q52, or 2^-54. +*/ + +dq_2: + bge dq_3 @ +Inf? + movs r1,#0 + b dq_4 + +dq_0: + lsrs r1,#31 + lsls r1,#31 @ preserve sign bit + lsrs r2,#21 @ extract exponent + beq dq_4 @ -0? 
return it + asrs r1,#11 @ make -Inf + b dq_4 + +dq_3: + ldr r1,=#0x7ff + lsls r1,#20 @ return +Inf +dq_4: + movs r0,#0 +dq_1: + bx r14 + +.align 2 +.thumb_func +qfp_dsqrt: + lsls r2,r1,#1 + bcs dq_0 @ negative? + lsrs r2,#21 @ extract exponent + subs r2,#1 + ldr r3,=#0x7fe + cmp r2,r3 + bhs dq_2 @ catches 0 and +Inf + push {r4-r7,r14} + lsls r4,r2,#20 + subs r1,r4 @ insert implied 1 + lsrs r2,#1 + bcc 1f @ even exponent? skip + adds r0,r0,r0 @ odd exponent: shift up mantissa + adcs r1,r1,r1 +1: + lsrs r3,#2 + adds r2,r3 + lsls r2,#20 + mov r12,r2 @ save result exponent + +@ here +@ r0:r1 y mantissa Q52 [1,4) +@ r12 result exponent + + adr r4,drsqrtapp-8 @ first eight table entries are never accessed because of the mantissa's leading 1 + lsrs r2,r1,#17 @ y Q3 + ldrb r2,[r4,r2] @ initial approximation to reciprocal square root a0 Q8 + lsrs r3,r1,#4 @ first Newton-Raphson iteration + muls r3,r2 + muls r3,r2 @ i32 p0=a0*a0*(y>>14); // Q32 + asrs r3,r3,#12 @ i32 r0=p0>>12; // Q20 + muls r3,r2 + asrs r3,#13 @ i32 dy0=(r0*a0)>>13; // Q15 + lsls r2,#8 + subs r2,r3 @ i32 a1=(a0<<8)-dy0; // Q16 + + movs r3,r2 + muls r3,r3 + lsrs r3,#13 + lsrs r4,r1,#1 + muls r3,r4 @ i32 p1=((a1*a1)>>11)*(y>>11); // Q19*Q19=Q38 + asrs r3,#15 @ i32 r1=p1>>15; // Q23 + muls r3,r2 + asrs r3,#23 + adds r3,#1 + asrs r3,#1 @ i32 dy1=(r1*a1+(1<<23))>>24; // Q23*Q16=Q39; Q15 + subs r2,r3 @ i32 a2=a1-dy1; // Q16 + lsrs r3,r2,#16 + subs r2,r3 @ if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2 + +@ here +@ r0:r1 y mantissa +@ r2 a2 ~ 1/sqrt(y) Q16 +@ r12 result exponent + + movs r3,r2 + muls r3,r3 + lsls r1,#10 + lsrs r4,r0,#22 + orrs r1,r4 @ y Q30 + mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3 @ i64 p2=(ui64)(a2*a2)*(ui64)y; // Q62 r4:r3 + lsls r5,r3,#6 + lsrs r4,#26 + orrs r4,r5 + adds r4,#0x20 @ i32 r2=(p2>>26)+0x20; // Q36 r4 + uxth r5,r4 + muls r5,r2 + asrs r4,#16 + muls r4,r2 + lsrs r5,#16 + adds r4,r5 + asrs r4,#6 @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30 + lsls r2,#15 + subs r2,r4 + +@ here +@ r0 y low bits +@ r1 y Q30 +@ r2 a3 ~ 1/sqrt(y) Q31 +@ r12 result exponent + + mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4 + adds r3,r3,r3 + adcs r4,r4,r4 + adds r3,r3,r3 + movs r3,#0 + adcs r3,r4 @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30 + +@ here +@ r0 y low bits +@ r1 y Q30 +@ r2 a3 Q31 ~ 1/sqrt(y) +@ r3 a4 Q30 ~ sqrt(y) +@ r12 result exponent + + square32_64 r3, r4,r5, r6,r5,r7 + lsls r6,r0,#8 + lsrs r7,r1,#2 + subs r6,r4 + sbcs r7,r5 @ r4=(q60)y-a4*a4 + +@ by exhaustive testing, r4 = fffffffc0e134fdc .. 
00000003c2bf539c Q60 + + lsls r5,r7,#29 + lsrs r6,#3 + adcs r6,r5 @ r4 Q57 with rounding + muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2 @ d4=a3*r4/2 Q89 +@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30 + + adds r2,#8 + asrs r2,#5 @ d4 Q52, rounded to Q53 with spare bit in carry + +@ here +@ r0 y low bits +@ r1 y Q30 +@ r2 d4 Q52, rounded to Q53 +@ C flag contains d4_b53 +@ r3 a4 Q30 + + bcs dq_5 + + lsrs r5,r3,#10 @ a4 Q52 + lsls r4,r3,#22 + + asrs r1,r2,#31 + adds r0,r2,r4 + adcs r1,r5 @ a4+d4 + + add r1,r12 @ pack exponent + pop {r4-r7,r15} + +.ltorg + + +@ round(sqrt(2^22./[68:8:252])) +drsqrtapp: +.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8 +.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95 +.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81 + +dq_5: +@ here we are near a rounding boundary, C is set + adcs r2,r2,r2 @ d4 Q53+1ulp + lsrs r5,r3,#9 + lsls r4,r3,#23 @ r4:r5 a4 Q53 + asrs r1,r2,#31 + adds r4,r2,r4 + adcs r5,r1 @ r4:r5 a5=a4+d4 Q53+1ulp + movs r3,r5 + muls r3,r4 + square32_64 r4,r1,r2,r6,r2,r7 + adds r2,r3 + adds r2,r3 @ r1:r2 a5^2 Q106 + lsls r0,#22 @ y Q84 + + rsbs r1,#0 + sbcs r0,r2 @ remainder y-a5^2 + bmi 1f @ y +@ also set flags accordingly +.thumb_func +qfp_dcmp: + push {r4,r6,r7,r14} + ldr r7,=#0x7ff @ flush NaNs and denormals + lsls r4,r1,#1 + lsrs r4,#21 + beq 1f + cmp r4,r7 + bne 2f +1: + movs r0,#0 + lsrs r1,#20 + lsls r1,#20 +2: + lsls r4,r3,#1 + lsrs r4,#21 + beq 1f + cmp r4,r7 + bne 2f +1: + movs r2,#0 + lsrs r3,#20 + lsls r3,#20 +2: +dcmp_fast_entry: + movs r6,#1 + eors r3,r1 + bmi 4f @ opposite signs? then can proceed on basis of sign of x + eors r3,r1 @ restore r3 + bpl 1f + rsbs r6,#0 @ negative? flip comparison +1: + cmp r1,r3 + bne 1f + cmp r0,r2 + bhi 2f + blo 3f +5: + movs r6,#0 @ equal? result is 0 +1: + bgt 2f +3: + rsbs r6,#0 +2: + subs r0,r6,#0 @ copy and set flags + pop {r4,r6,r7,r15} +4: + orrs r3,r1 @ make -0==+0 + adds r3,r3 + orrs r3,r0 + orrs r3,r2 + beq 5b + cmp r1,#0 + bge 2b + b 3b + + +@ "scientific" functions start here + +.thumb_func +push_r8_r11: + mov r4,r8 + mov r5,r9 + mov r6,r10 + mov r7,r11 + push {r4-r7} + bx r14 + +.thumb_func +pop_r8_r11: + pop {r4-r7} + mov r8,r4 + mov r9,r5 + mov r10,r6 + mov r11,r7 + bx r14 + +@ double-length CORDIC rotation step + +@ r0:r1 ω +@ r6 32-i (complementary shift) +@ r7 i (shift) +@ r8:r9 x +@ r10:r11 y +@ r12 coefficient pointer + +@ an option in rotation mode would be to compute the sequence of σ values +@ in one pass, rotate the initial vector by the residual ω and then run a +@ second pass to compute the final x and y. This would relieve pressure +@ on registers and hence possibly be faster. The same trick does not work +@ in vectoring mode (but perhaps one could work to single precision in +@ a first pass and then double precision in a second pass?). 
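+
+@ Illustrative C model of one vectoring step (dcordic_vec_step below); it is
+@ not part of qfplib. x, y and w are the Q62 values held in r8:r9, r10:r11
+@ and r0:r1, dw is the dtab_cc entry atan(2^-i), and the carry-based rounding
+@ of the shifted operands is omitted:
+/*
+#include <stdint.h>
+
+static void cordic_vec_step(int64_t *x, int64_t *y, int64_t *w,
+                            int64_t dw, int i)
+{
+    int64_t x0 = *x;              // y is updated with the *old* x
+    if (*y >= 0) {                // drive y towards zero...
+        *w += dw;                 // ...accumulating the angle rotated through
+        *x += *y >> i;
+        *y -= x0 >> i;
+    } else {
+        *w -= dw;
+        *x -= *y >> i;
+        *y += x0 >> i;
+    }
+}
+*/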
+ +.thumb_func +dcordic_vec_step: + mov r2,r12 + ldmia r2!,{r3,r4} + mov r12,r2 + mov r2,r11 + cmp r2,#0 + blt 1f + b 2f + +.thumb_func +dcordic_rot_step: + mov r2,r12 + ldmia r2!,{r3,r4} + mov r12,r2 + cmp r1,#0 + bge 1f +2: +@ ω<0 / y>=0 +@ ω+=dω +@ x+=y>>i, y-=x>>i + adds r0,r3 + adcs r1,r4 + + mov r3,r11 + asrs r3,r7 + mov r4,r11 + lsls r4,r6 + mov r2,r10 + lsrs r2,r7 + orrs r2,r4 @ r2:r3 y>>i, rounding in carry + mov r4,r8 + mov r5,r9 @ r4:r5 x + adcs r2,r4 + adcs r3,r5 @ r2:r3 x+(y>>i) + mov r8,r2 + mov r9,r3 + + mov r3,r5 + lsls r3,r6 + asrs r5,r7 + lsrs r4,r7 + orrs r4,r3 @ r4:r5 x>>i, rounding in carry + mov r2,r10 + mov r3,r11 + sbcs r2,r4 + sbcs r3,r5 @ r2:r3 y-(x>>i) + mov r10,r2 + mov r11,r3 + bx r14 + + +@ ω>0 / y<0 +@ ω-=dω +@ x-=y>>i, y+=x>>i +1: + subs r0,r3 + sbcs r1,r4 + + mov r3,r9 + asrs r3,r7 + mov r4,r9 + lsls r4,r6 + mov r2,r8 + lsrs r2,r7 + orrs r2,r4 @ r2:r3 x>>i, rounding in carry + mov r4,r10 + mov r5,r11 @ r4:r5 y + adcs r2,r4 + adcs r3,r5 @ r2:r3 y+(x>>i) + mov r10,r2 + mov r11,r3 + + mov r3,r5 + lsls r3,r6 + asrs r5,r7 + lsrs r4,r7 + orrs r4,r3 @ r4:r5 y>>i, rounding in carry + mov r2,r8 + mov r3,r9 + sbcs r2,r4 + sbcs r3,r5 @ r2:r3 x-(y>>i) + mov r8,r2 + mov r9,r3 + bx r14 + +ret_dzero: + movs r0,#0 + movs r1,#0 + bx r14 + +@ convert double to signed int, rounding towards 0, clamping +.thumb_func +qfp_double2int_z: + cmp r1,#0 + bge qfp_double2int @ +ve or zero? then use rounding towards -Inf + push {r14} + lsls r1,#1 @ -ve: clear sign bit + lsrs r1,#1 + bl qfp_double2uint @ convert to unsigned, rounding towards -Inf + movs r1,#1 + lsls r1,#31 @ r1=0x80000000 + cmp r0,r1 + bhi 1f + rsbs r0,#0 + pop {r15} +1: + mov r0,r1 + pop {r15} + +@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf +@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works + +.thumb_func +qfp_double2int: + movs r2,#0 @ and fall through +.thumb_func +qfp_double2fix: + push {r14} + adds r2,#32 + bl qfp_double2fix64 + movs r0,r1 + pop {r15} + +.thumb_func +qfp_double2uint: + movs r2,#0 @ and fall through +.thumb_func +qfp_double2ufix: + push {r14} + adds r2,#32 + bl qfp_double2ufix64 + movs r0,r1 + pop {r15} + +.thumb_func +qfp_float2int64_z: + cmp r0,#0 + bge qfp_float2int64 @ +ve or zero? then use rounding towards -Inf + push {r14} + lsls r0,#1 @ -ve: clear sign bit + lsrs r0,#1 + bl qfp_float2uint64 @ convert to unsigned, rounding towards -Inf + movs r2,#1 + lsls r2,#31 @ r2=0x80000000 + cmp r1,r2 + bhs 1f + mvns r1,r1 + rsbs r0,#0 + bcc 2f + adds r1,#1 +2: + pop {r15} +1: + movs r0,#0 + mov r1,r2 + pop {r15} + +.thumb_func +qfp_float2int64: + movs r1,#0 @ and fall through +.thumb_func +qfp_float2fix64: + push {r14} + bl f2fix + b d2f64_a + +.thumb_func +qfp_float2uint64: + movs r1,#0 @ and fall through +.thumb_func +qfp_float2ufix64: + asrs r3,r0,#23 @ negative? return 0 + bmi ret_dzero +@ and fall through + +@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf +@ result clamped so that r3 can only be 0 or -1 +@ trashes r12 +.thumb_func +f2fix: + push {r4,r14} + mov r12,r1 + asrs r3,r0,#31 + lsls r0,#1 + lsrs r2,r0,#24 + beq 1f @ zero? + cmp r2,#0xff @ Inf? 
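+
+@ (The beq below sends an infinite argument to the clamp at 2f.)
+@ Illustrative C model of this conversion, not part of qfplib; it omits the
+@ clamping and the r3 sign-extension word:
+/*
+#include <math.h>
+#include <stdint.h>
+
+static int64_t f2fix_model(float x, int places)   // places as passed in r1
+{
+    return (int64_t)floorf(ldexpf(x, places));    // scale, round towards -Inf
+}
+*/
+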
+ beq 2f
+ subs r1,r2,#1
+ subs r2,#0x7f @ remove exponent bias
+ lsls r1,#24
+ subs r0,r1 @ insert implied 1
+ eors r0,r3
+ subs r0,r3 @ top two's complement
+ asrs r1,r0,#4 @ convert to double format
+ lsls r0,#28
+ b d2fix_a
+1:
+ movs r0,#0
+ movs r1,r0
+ movs r3,r0
+ pop {r4,r15}
+2:
+ mvns r0,r3 @ return max/min value
+ mvns r1,r3
+ pop {r4,r15}
+
+.thumb_func
+qfp_double2int64_z:
+ cmp r1,#0
+ bge qfp_double2int64 @ +ve or zero? then use rounding towards -Inf
+ push {r14}
+ lsls r1,#1 @ -ve: clear sign bit
+ lsrs r1,#1
+ bl qfp_double2uint64 @ convert to unsigned, rounding towards -Inf
+ cmp r1,#0
+ blt 1f
+ mvns r1,r1
+ rsbs r0,#0
+ bcc 2f
+ adds r1,#1
+2:
+ pop {r15}
+1:
+ movs r0,#0
+ movs r1,#1
+ lsls r1,#31 @ 0x80000000
+ pop {r15}
+
+.thumb_func
+qfp_double2int64:
+ movs r2,#0 @ and fall through
+.thumb_func
+qfp_double2fix64:
+ push {r14}
+ bl d2fix
+d2f64_a:
+ asrs r2,r1,#31
+ cmp r2,r3
+ bne 1f @ sign extension bits fail to match sign of result?
+ pop {r15}
+1:
+ mvns r0,r3
+ movs r1,#1
+ lsls r1,#31
+ eors r1,r1,r0 @ generate extreme fixed-point values
+ pop {r15}
+
+.thumb_func
+qfp_double2uint64:
+ movs r2,#0 @ and fall through
+.thumb_func
+qfp_double2ufix64:
+ asrs r3,r1,#20 @ negative? return 0
+ bmi ret_dzero
+@ and fall through
+
+@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf
+@ result clamped so that r3 can only be 0 or -1
+@ trashes r12
+.thumb_func
+d2fix:
+ push {r4,r14}
+ mov r12,r2
+ bl dunpacks
+ asrs r4,r2,#16
+ adds r4,#1
+ bge 1f
+ movs r1,#0 @ -0 -> +0
+1:
+ asrs r3,r1,#31
+d2fix_a:
+@ here
+@ r0:r1 two's complement mantissa
+@ r2 unbiased exponent
+@ r3 mantissa sign extension bits
+ add r2,r12 @ exponent plus offset for required binary point position
+ subs r2,#52 @ required shift
+ bmi 1f @ shift down?
+@ here a shift up by r2 places
+ cmp r2,#12 @ will clamp?
+ bge 2f
+ movs r4,r0
+ lsls r1,r2
+ lsls r0,r2
+ rsbs r2,#0
+ adds r2,#32 @ complementary shift
+ lsrs r4,r2
+ orrs r1,r4
+ pop {r4,r15}
+2:
+ mvns r0,r3
+ mvns r1,r3 @ overflow: clamp to extreme fixed-point values
+ pop {r4,r15}
+1:
+@ here a shift down by -r2 places
+ adds r2,#32
+ bmi 1f @ long shift?
+ mov r4,r1
+ lsls r4,r2
+ rsbs r2,#0
+ adds r2,#32 @ complementary shift
+ asrs r1,r2
+ lsrs r0,r2
+ orrs r0,r4
+ pop {r4,r15}
+1:
+@ here a long shift down
+ movs r0,r1
+ asrs r1,#31 @ shift down 32 places
+ adds r2,#32
+ bmi 1f @ very long shift?
+ rsbs r2,#0
+ adds r2,#32
+ asrs r0,r2
+ pop {r4,r15}
+1:
+ movs r0,r3 @ result very near zero: use sign extension bits
+ movs r1,r3
+ pop {r4,r15}
+
+@ float <-> double conversions
+.thumb_func
+qfp_float2double:
+ lsrs r3,r0,#31 @ sign bit
+ lsls r3,#31
+ lsls r1,r0,#1
+ lsrs r2,r1,#24 @ exponent
+ beq 1f @ zero?
+ cmp r2,#0xff @ Inf?
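+
+@ (The beq below routes exponent 0xff to the signed-infinity return at 2f;
+@ NaNs therefore also come back as infinities.)  For finite normal inputs the
+@ conversion is exact: re-bias the exponent from 0x7f to 0x3ff and move the
+@ 23 mantissa bits to the top of the 52-bit field. Illustrative C model on
+@ the packed bit patterns (not part of qfplib):
+/*
+#include <stdint.h>
+
+static uint64_t f2d_model(uint32_t f)    // finite, normal f only
+{
+    uint64_t sign = (uint64_t)(f >> 31) << 63;
+    uint64_t exp  = (uint64_t)(((f >> 23) & 0xffu) + 0x380u) << 52;
+    uint64_t mant = (uint64_t)(f & 0x7fffffu) << 29;    // 23 -> 52 bits
+    return sign | exp | mant;
+}
+*/
+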
+ beq 2f + lsrs r1,#4 @ exponent and top 20 bits of mantissa + ldr r2,=#(0x3ff-0x7f)<<20 @ difference in exponent offsets + adds r1,r2 + orrs r1,r3 + lsls r0,#29 @ bottom 3 bits of mantissa + bx r14 +1: + movs r1,r3 @ return signed zero +3: + movs r0,#0 + bx r14 +2: + ldr r1,=#0x7ff00000 @ return signed infinity + adds r1,r3 + b 3b + +.thumb_func +qfp_double2float: + lsls r2,r1,#1 + lsrs r2,#21 @ exponent + ldr r3,=#0x3ff-0x7f + subs r2,r3 @ fix exponent bias + ble 1f @ underflow or zero + cmp r2,#0xff + bge 2f @ overflow or infinity + lsls r2,#23 @ position exponent of result + lsrs r3,r1,#31 + lsls r3,#31 + orrs r2,r3 @ insert sign + lsls r3,r0,#3 @ rounding bits + lsrs r0,#29 + lsls r1,#12 + lsrs r1,#9 + orrs r0,r1 @ assemble mantissa + orrs r0,r2 @ insert exponent and sign + lsls r3,#1 + bcc 3f @ no rounding + beq 4f @ all sticky bits 0? +5: + adds r0,#1 +3: + bx r14 +4: + lsrs r3,r0,#1 @ odd? then round up + bcs 5b + bx r14 +1: + beq 6f @ check case where value is just less than smallest normal +7: + lsrs r0,r1,#31 + lsls r0,#31 + bx r14 +6: + lsls r2,r1,#12 @ 20 1:s at top of mantissa? + asrs r2,#12 + adds r2,#1 + bne 7b + lsrs r2,r0,#29 @ and 3 more 1:s? + cmp r2,#7 + bne 7b + movs r2,#1 @ return smallest normal with correct sign + b 8f +2: + movs r2,#0xff +8: + lsrs r0,r1,#31 @ return signed infinity + lsls r0,#8 + adds r0,r2 + lsls r0,#23 + bx r14 + +@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding + +.thumb_func +qfp_uint2double: + movs r1,#0 @ and fall through +.thumb_func +qfp_ufix2double: + movs r2,r1 + movs r1,#0 + b qfp_ufix642double + +.thumb_func +qfp_int2double: + movs r1,#0 @ and fall through +.thumb_func +qfp_fix2double: + movs r2,r1 + asrs r1,r0,#31 @ sign extend + b qfp_fix642double + +.thumb_func +qfp_uint642double: + movs r2,#0 @ and fall through +.thumb_func +qfp_ufix642double: + movs r3,#0 + b uf2d + +.thumb_func +qfp_int642double: + movs r2,#0 @ and fall through +.thumb_func +qfp_fix642double: + asrs r3,r1,#31 @ sign bit across all bits + eors r0,r3 + eors r1,r3 + subs r0,r3 + sbcs r1,r3 +uf2d: + push {r4,r5,r14} + ldr r4,=#0x432 + subs r2,r4,r2 @ form biased exponent +@ here +@ r0:r1 unnormalised mantissa +@ r2 -Q (will become exponent) +@ r3 sign across all bits + cmp r1,#0 + bne 1f @ short normalising shift? + movs r1,r0 + beq 2f @ zero? return it + movs r0,#0 + subs r2,#32 @ fix exponent +1: + asrs r4,r1,#21 + bne 3f @ will need shift down (and rounding?) + bcs 4f @ normalised already? +5: + subs r2,#1 + adds r0,r0 @ shift up + adcs r1,r1 + lsrs r4,r1,#21 + bcc 5b +4: + ldr r4,=#0x7fe + cmp r2,r4 + bhs 6f @ over/underflow? return signed zero/infinity +7: + lsls r2,#20 @ pack and return + adds r1,r2 + lsls r3,#31 + adds r1,r3 +2: + pop {r4,r5,r15} +6: @ return signed zero/infinity according to unclamped exponent in r2 + mvns r2,r2 + lsrs r2,#21 + movs r0,#0 + movs r1,#0 + b 7b + +3: +@ here we need to shift down to normalise and possibly round + bmi 1f @ already normalised to Q63? +2: + subs r2,#1 + adds r0,r0 @ shift up + adcs r1,r1 + bpl 2b +1: +@ here we have a 1 in b63 of r0:r1 + adds r2,#11 @ correct exponent for subsequent shift down + lsls r4,r0,#21 @ save bits for rounding + lsrs r0,#11 + lsls r5,r1,#21 + orrs r0,r5 + lsrs r1,#11 + lsls r4,#1 + beq 1f @ sticky bits are zero? 
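+
+@ The net effect of the code around label 8 below is round-to-nearest-even on
+@ the excess bits, i.e. exactly what a C cast from a 64-bit integer to double
+@ does. Illustrative worked example (not part of qfplib):
+/*
+    unsigned long long n = (1ull << 53) + 1;  // needs 54 significant bits
+    double d = (double)n;  // exactly halfway between 2^53 and 2^53+2:
+                           // ties go to the even mantissa, so
+                           // d == 9007199254740992.0 (2^53)
+*/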
+8: + movs r4,#0 + adcs r0,r4 + adcs r1,r4 + b 4b +1: + bcc 4b @ sticky bits are zero but not on rounding boundary + lsrs r4,r0,#1 @ increment if odd (force round to even) + b 8b + + +.ltorg + +.thumb_func +dunpacks: + mdunpacks r0,r1,r2,r3,r4 + ldr r3,=#0x3ff + subs r2,r3 @ exponent without offset + bx r14 + +@ r0:r1 signed mantissa Q52 +@ r2 unbiased exponent < 10 (i.e., |x|<2^10) +@ r4 pointer to: +@ - divisor reciprocal approximation r=1/d Q15 +@ - divisor d Q62 0..20 +@ - divisor d Q62 21..41 +@ - divisor d Q62 42..62 +@ returns: +@ r0:r1 reduced result y Q62, -0.6 d < y < 0.6 d (better in practice) +@ r2 quotient q (number of reductions) +@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign +@ designed to work for 0.5=0: in quadrant 0 + cmp r1,r3 + ble 2f @ y<~x so 0≤θ<~π/4: skip + adds r6,#1 + eors r1,r5 @ negate x + b 3f @ and exchange x and y = rotate by -π/2 +1: + cmp r3,r7 + bge 2f @ -y<~x so -π/4<~θ≤0: skip + subs r6,#1 + eors r3,r5 @ negate y and ... +3: + movs r7,r0 @ exchange x and y + movs r0,r2 + movs r2,r7 + movs r7,r1 + movs r1,r3 + movs r3,r7 +2: +@ here -π/4<~θ<~π/4 +@ r6 has quadrant offset + push {r6} + cmp r2,#0 + bne 1f + cmp r3,#0 + beq 10f @ x==0 going into division? + lsls r4,r3,#1 + asrs r4,#21 + adds r4,#1 + bne 1f @ x==Inf going into division? + lsls r4,r1,#1 + asrs r4,#21 + adds r4,#1 @ y also ±Inf? + bne 10f + subs r1,#1 @ make them both just finite + subs r3,#1 + b 1f + +10: + movs r0,#0 + movs r1,#0 + b 12f + +1: + bl qfp_ddiv + movs r2,#62 + bl qfp_double2fix64 +@ r0:r1 y/x + mov r10,r0 + mov r11,r1 + movs r0,#0 @ ω=0 + movs r1,#0 + mov r8,r0 + movs r2,#1 + lsls r2,#30 + mov r9,r2 @ x=1 + + adr r4,dtab_cc + mov r12,r4 + movs r7,#1 + movs r6,#31 +1: + bl dcordic_vec_step + adds r7,#1 + subs r6,#1 + cmp r7,#33 + bne 1b +@ r0:r1 atan(y/x) Q62 +@ r8:r9 x residual Q62 +@ r10:r11 y residual Q62 + mov r2,r9 + mov r3,r10 + subs r2,#12 @ this makes atan(0)==0 +@ the following is basically a division residual y/x ~ atan(residual y/x) + movs r4,#1 + lsls r4,#29 + movs r7,#0 +2: + lsrs r2,#1 + movs r3,r3 @ preserve carry + bmi 1f + sbcs r3,r2 + adds r0,r4 + adcs r1,r7 + lsrs r4,#1 + bne 2b + b 3f +1: + adcs r3,r2 + subs r0,r4 + sbcs r1,r7 + lsrs r4,#1 + bne 2b +3: + lsls r6,r1,#31 + asrs r1,#1 + lsrs r0,#1 + orrs r0,r6 @ Q61 + +12: + pop {r6} + + cmp r6,#0 + beq 1f + ldr r4,=#0x885A308D @ π/2 Q61 + ldr r5,=#0x3243F6A8 + bpl 2f + mvns r4,r4 @ negative quadrant offset + mvns r5,r5 +2: + lsls r6,#31 + bne 2f @ skip if quadrant offset is ±1 + adds r0,r4 + adcs r1,r5 +2: + adds r0,r4 + adcs r1,r5 +1: + movs r2,#61 + bl qfp_fix642double + + bl pop_r8_r11 + pop {r4-r7,r15} + +.ltorg + +dtab_cc: +.word 0x61bb4f69, 0x1dac6705 @ atan 2^-1 Q62 +.word 0x96406eb1, 0x0fadbafc @ atan 2^-2 Q62 +.word 0xab0bdb72, 0x07f56ea6 @ atan 2^-3 Q62 +.word 0xe59fbd39, 0x03feab76 @ atan 2^-4 Q62 +.word 0xba97624b, 0x01ffd55b @ atan 2^-5 Q62 +.word 0xdddb94d6, 0x00fffaaa @ atan 2^-6 Q62 +.word 0x56eeea5d, 0x007fff55 @ atan 2^-7 Q62 +.word 0xaab7776e, 0x003fffea @ atan 2^-8 Q62 +.word 0x5555bbbc, 0x001ffffd @ atan 2^-9 Q62 +.word 0xaaaaadde, 0x000fffff @ atan 2^-10 Q62 +.word 0xf555556f, 0x0007ffff @ atan 2^-11 Q62 +.word 0xfeaaaaab, 0x0003ffff @ atan 2^-12 Q62 +.word 0xffd55555, 0x0001ffff @ atan 2^-13 Q62 +.word 0xfffaaaab, 0x0000ffff @ atan 2^-14 Q62 +.word 0xffff5555, 0x00007fff @ atan 2^-15 Q62 +.word 0xffffeaab, 0x00003fff @ atan 2^-16 Q62 +.word 0xfffffd55, 0x00001fff @ atan 2^-17 Q62 +.word 0xffffffab, 0x00000fff @ atan 2^-18 Q62 +.word 0xfffffff5, 0x000007ff @ atan 2^-19 Q62 
+.word 0xffffffff, 0x000003ff @ atan 2^-20 Q62 +.word 0x00000000, 0x00000200 @ atan 2^-21 Q62 @ consider optimising these +.word 0x00000000, 0x00000100 @ atan 2^-22 Q62 +.word 0x00000000, 0x00000080 @ atan 2^-23 Q62 +.word 0x00000000, 0x00000040 @ atan 2^-24 Q62 +.word 0x00000000, 0x00000020 @ atan 2^-25 Q62 +.word 0x00000000, 0x00000010 @ atan 2^-26 Q62 +.word 0x00000000, 0x00000008 @ atan 2^-27 Q62 +.word 0x00000000, 0x00000004 @ atan 2^-28 Q62 +.word 0x00000000, 0x00000002 @ atan 2^-29 Q62 +.word 0x00000000, 0x00000001 @ atan 2^-30 Q62 +.word 0x80000000, 0x00000000 @ atan 2^-31 Q62 +.word 0x40000000, 0x00000000 @ atan 2^-32 Q62 + +.thumb_func +qfp_dexp: + push {r4-r7,r14} + bl dunpacks + adr r4,dreddata1 + bl dreduce + cmp r1,#0 + bge 1f + ldr r4,=#0xF473DE6B + ldr r5,=#0x2C5C85FD @ ln2 Q62 + adds r0,r4 + adcs r1,r5 + subs r2,#1 +1: + push {r2} + movs r7,#1 @ shift + adr r6,dtab_exp + movs r2,#0 + movs r3,#1 + lsls r3,#30 @ x=1 Q62 + +3: + ldmia r6!,{r4,r5} + mov r12,r6 + subs r0,r4 + sbcs r1,r5 + bmi 1f + + rsbs r6,r7,#0 + adds r6,#32 @ complementary shift + movs r5,r3 + asrs r5,r7 + movs r4,r3 + lsls r4,r6 + movs r6,r2 + lsrs r6,r7 @ rounding bit in carry + orrs r4,r6 + adcs r2,r4 + adcs r3,r5 @ x+=x>>i + b 2f + +1: + adds r0,r4 @ restore argument + adcs r1,r5 +2: + mov r6,r12 + adds r7,#1 + cmp r7,#33 + bne 3b + +@ here +@ r0:r1 ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0) +@ r2:r3 exp a Q62 +@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε) + muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1 +@ r4:r1 ε exp a Q(62+62-32)=Q92 + lsrs r4,#30 + lsls r0,r1,#2 + orrs r0,r4 + asrs r1,#30 + adds r0,r2 + adcs r1,r3 + + pop {r2} + rsbs r2,#0 + adds r2,#62 + bl qfp_fix642double @ in principle we can pack faster than this because we know the exponent + pop {r4-r7,r15} + +.ltorg + +.thumb_func +qfp_dln: + push {r4-r7,r14} + lsls r7,r1,#1 + bcs 5f @ <0 ... + asrs r7,#21 + beq 5f @ ... or =0? return -Inf + adds r7,#1 + beq 6f @ Inf/NaN? return +Inf + bl dunpacks + push {r2} + lsls r1,#9 + lsrs r2,r0,#23 + orrs r1,r2 + lsls r0,#9 +@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1 + + movs r7,#1 @ shift + adr r6,dtab_exp + mov r12,r6 + movs r2,#0 + movs r3,#0 @ y=0 Q62 + +3: + rsbs r6,r7,#0 + adds r6,#32 @ complementary shift + movs r5,r1 + asrs r5,r7 + movs r4,r1 + lsls r4,r6 + movs r6,r0 + lsrs r6,r7 + orrs r4,r6 @ x>>i, rounding bit in carry + adcs r4,r0 + adcs r5,r1 @ x+(x>>i) + + lsrs r6,r5,#30 + bne 1f @ x+(x>>i)>1? + movs r0,r4 + movs r1,r5 @ x+=x>>i + mov r6,r12 + ldmia r6!,{r4,r5} + subs r2,r4 + sbcs r3,r5 + +1: + movs r4,#8 + add r12,r4 + adds r7,#1 + cmp r7,#33 + bne 3b +@ here: +@ r0:r1 residual x, nearly 1 Q62 +@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62 +@ result is y + ln2 + ln x ~ y + ln2 + (x-1) + lsls r1,#2 + asrs r1,#2 @ x-1 + adds r2,r0 + adcs r3,r1 + + pop {r7} +@ here: +@ r2:r3 ln m/2 = ln m - ln2 Q62 +@ r7 unbiased exponent + + adr r4,dreddata1+4 + ldmia r4,{r0,r1,r4} + adds r7,#1 + muls r0,r7 @ Q62 + muls r1,r7 @ Q41 + muls r4,r7 @ Q20 + lsls r7,r1,#21 + asrs r1,#11 + asrs r5,r1,#31 + adds r0,r7 + adcs r1,r5 + lsls r7,r4,#10 + asrs r4,#22 + asrs r5,r1,#31 + adds r1,r7 + adcs r4,r5 +@ r0:r1:r4 exponent*ln2 Q62 + asrs r5,r3,#31 + adds r0,r2 + adcs r1,r3 + adcs r4,r5 +@ r0:r1:r4 result Q62 + movs r2,#62 +1: + asrs r5,r1,#31 + cmp r4,r5 + beq 2f @ r4 a sign extension of r1? 
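+
+@ (The loop below renormalises the Q62 triple-word sum in r0:r1:r4 until r4
+@ is a pure sign extension of r1, adjusting the Q value in r2 to match.)
+@ Illustrative C model of the ln pseudo-multiplication at label 3 above (not
+@ part of qfplib): double arithmetic stands in for the Q62 fixed point. Only
+@ the mantissa is modelled; halving it contributes the ln 2 term, and the
+@ remaining exponent multiples of ln 2 are added separately above.
+/*
+#include <math.h>
+
+static double ln_model(double m)          // m in [1,2): the mantissa
+{
+    double x = m / 2, y = 0;              // 0.5 <= x < 1
+    for (int i = 1; i <= 32; i++) {
+        double t = x + ldexp(x, -i);      // x + (x >> i)
+        if (t < 1.0) {                    // keep x below 1.0
+            x = t;
+            y -= log1p(ldexp(1.0, -i));   // subtract log(1+2^-i), cf. dtab_exp
+        }
+    }
+    return y + log(2.0) + (x - 1.0);      // ln m = ln(m/2) + ln 2; ln x ~ x-1
+}
+*/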
+ lsrs r0,#4 @ no: shift down 4 places and try again + lsls r6,r1,#28 + orrs r0,r6 + lsrs r1,#4 + lsls r6,r4,#28 + orrs r1,r6 + asrs r4,#4 + subs r2,#4 + b 1b +2: + bl qfp_fix642double + pop {r4-r7,r15} + +5: + ldr r1,=#0xfff00000 + movs r0,#0 + pop {r4-r7,r15} + +6: + ldr r1,=#0x7ff00000 + movs r0,#0 + pop {r4-r7,r15} + + +.ltorg + +.align 2 +dreddata1: +.word 0x0000B8AA @ 1/ln2 Q15 +.word 0x0013DE6B @ ln2 Q62 Q62=2C5C85FDF473DE6B split into 21-bit pieces +.word 0x000FEFA3 +.word 0x000B1721 + +dtab_exp: +.word 0xbf984bf3, 0x19f323ec @ log 1+2^-1 Q62 +.word 0xcd4d10d6, 0x0e47fbe3 @ log 1+2^-2 Q62 +.word 0x8abcb97a, 0x0789c1db @ log 1+2^-3 Q62 +.word 0x022c54cc, 0x03e14618 @ log 1+2^-4 Q62 +.word 0xe7833005, 0x01f829b0 @ log 1+2^-5 Q62 +.word 0x87e01f1e, 0x00fe0545 @ log 1+2^-6 Q62 +.word 0xac419e24, 0x007f80a9 @ log 1+2^-7 Q62 +.word 0x45621781, 0x003fe015 @ log 1+2^-8 Q62 +.word 0xa9ab10e6, 0x001ff802 @ log 1+2^-9 Q62 +.word 0x55455888, 0x000ffe00 @ log 1+2^-10 Q62 +.word 0x0aa9aac4, 0x0007ff80 @ log 1+2^-11 Q62 +.word 0x01554556, 0x0003ffe0 @ log 1+2^-12 Q62 +.word 0x002aa9ab, 0x0001fff8 @ log 1+2^-13 Q62 +.word 0x00055545, 0x0000fffe @ log 1+2^-14 Q62 +.word 0x8000aaaa, 0x00007fff @ log 1+2^-15 Q62 +.word 0xe0001555, 0x00003fff @ log 1+2^-16 Q62 +.word 0xf80002ab, 0x00001fff @ log 1+2^-17 Q62 +.word 0xfe000055, 0x00000fff @ log 1+2^-18 Q62 +.word 0xff80000b, 0x000007ff @ log 1+2^-19 Q62 +.word 0xffe00001, 0x000003ff @ log 1+2^-20 Q62 +.word 0xfff80000, 0x000001ff @ log 1+2^-21 Q62 +.word 0xfffe0000, 0x000000ff @ log 1+2^-22 Q62 +.word 0xffff8000, 0x0000007f @ log 1+2^-23 Q62 +.word 0xffffe000, 0x0000003f @ log 1+2^-24 Q62 +.word 0xfffff800, 0x0000001f @ log 1+2^-25 Q62 +.word 0xfffffe00, 0x0000000f @ log 1+2^-26 Q62 +.word 0xffffff80, 0x00000007 @ log 1+2^-27 Q62 +.word 0xffffffe0, 0x00000003 @ log 1+2^-28 Q62 +.word 0xfffffff8, 0x00000001 @ log 1+2^-29 Q62 +.word 0xfffffffe, 0x00000000 @ log 1+2^-30 Q62 +.word 0x80000000, 0x00000000 @ log 1+2^-31 Q62 +.word 0x40000000, 0x00000000 @ log 1+2^-32 Q62 + +qfp_lib_end: diff --git a/sos-iir-filter.h b/sos-iir-filter.h index cc90b35..4d345f1 100644 --- a/sos-iir-filter.h +++ b/sos-iir-filter.h @@ -26,11 +26,50 @@ #include #include -#include -namespace fpm = std; -using sos_t = float; -//#include -//using sos_t = fpm::fixed_16_16; +extern "C" { +#include +} + +float qfp_fpow(float b, float e) +{ + return qfp_fexp(e * qfp_fln(b)); +} + +float qfp_flog10(float x) +{ + return qfp_fln(x) / qfp_fln(10); +} + +struct sos_t +{ + float v; + + constexpr sos_t(float v_ = 0.f): v(v_) {} + + sos_t operator+(const sos_t& o) const noexcept { + return qfp_fadd(v, o.v); + } + + sos_t operator-(const sos_t& o) const noexcept { + return qfp_fsub(v, o.v); + } + + sos_t operator*(const sos_t& o) const noexcept { + return qfp_fmul(v, o.v); + } + + sos_t operator/(const sos_t& o) const noexcept { + return qfp_fdiv(v, o.v); + } + + sos_t& operator+=(const sos_t& o) noexcept { + return (*this = *this + o); + } + + operator float() const noexcept { + return v; + } +}; struct SOS_Coefficients { sos_t b1; @@ -60,8 +99,7 @@ struct SOS_IIR_Filter { std::copy(_sos, _sos + N, sos.begin()); } - void filter(auto samples) { - // Apply all but last Second-Order-Section + void filter(auto samples, std::size_t n = N) { for ([[maybe_unused]] auto _ : std::views::zip_transform( [](auto samps, const auto& coeffs, auto& ww) { // Assumes a0 and b0 coefficients are one (1.0) @@ -72,13 +110,25 @@ struct SOS_IIR_Filter { } return 0; }, - std::views::repeat(samples, N), sos, w)) {} + 
std::views::repeat(samples, n), sos, w)) {} } sos_t filter_sum_sqr(auto samples) { - filter(samples); - return std::ranges::fold_left(samples, sos_t(0), - [this](auto a, auto b) { return a + b * gain * b * gain; }); + const auto& coeffs = sos.back(); + auto& ww = w.back(); + sos_t sum_sqr (0.f); + + filter(samples, N - 1); + + // Assumes a0 and b0 coefficients are one (1.0) + for (auto& s : samples) { + auto f6 = s + coeffs.a1 * ww.w0 + coeffs.a2 * ww.w1; + s = f6 + coeffs.b1 * ww.w0 + coeffs.b2 * ww.w1; + ww.w1 = std::exchange(ww.w0, f6); + sum_sqr += s * gain * s * gain; + } + + return sum_sqr; } }; @@ -90,12 +140,12 @@ struct SOS_IIR_Filter { // A ~= [1.0, -1.993853, 0.993863] // With additional DC blocking component SOS_IIR_Filter SPH0645LM4H_B_RB = { - /* gain: */ sos_t(1.00123377961525), + /* gain: */ sos_t(1.00123377961525f), /* sos: */ { // Second-Order Sections {b1, b2, -a1, -a2} - { sos_t(-1.0), sos_t(0.0), - sos_t(+0.9992), sos_t(0) }, // DC blocker, a1 = -0.9992 - { sos_t(-1.988897663539382), sos_t(+0.988928479008099), - sos_t(+1.993853376183491), sos_t(-0.993862821429572) } } + { sos_t(-1.0f), sos_t(0.0f), + sos_t(+0.9992f), sos_t(0.0f) }, // DC blocker, a1 = -0.9992 + { sos_t(-1.988897663539382f), sos_t(+0.988928479008099f), + sos_t(+1.993853376183491f), sos_t(-0.993862821429572f) } } }; // @@ -104,14 +154,14 @@ SOS_IIR_Filter SPH0645LM4H_B_RB = { // B = [0.169994948147430, 0.280415310498794, -1.120574766348363, 0.131562559965936, 0.974153561246036, -0.282740857326553, -0.152810756202003] // A = [1.0, -2.12979364760736134, 0.42996125885751674, 1.62132698199721426, -0.96669962900852902, 0.00121015844426781, 0.04400300696788968] SOS_IIR_Filter A_weighting = { - /* gain: */ sos_t(0.169994948147430), + /* gain: */ sos_t(0.169994948147430f), /* sos: */ { // Second-Order Sections {b1, b2, -a1, -a2} - { sos_t(-2.00026996133106), sos_t(+1.00027056142719), - sos_t(-1.060868438509278), sos_t(-0.163987445885926) }, - { sos_t(+4.35912384203144), sos_t(+3.09120265783884), - sos_t(+1.208419926363593), sos_t(-0.273166998428332) }, - { sos_t(-0.70930303489759), sos_t(-0.29071868393580), - sos_t(+1.982242159753048), sos_t(-0.982298594928989) } } + { sos_t(-2.00026996133106f), sos_t(+1.00027056142719f), + sos_t(-1.060868438509278f), sos_t(-0.163987445885926f) }, + { sos_t(+4.35912384203144f), sos_t(+3.09120265783884f), + sos_t(+1.208419926363593f), sos_t(-0.273166998428332f) }, + { sos_t(-0.70930303489759f), sos_t(-0.29071868393580f), + sos_t(+1.982242159753048f), sos_t(-0.982298594928989f) } } }; ////
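+
+// Illustrative use of the qfplib-backed sos_t and the filters above (a
+// sketch, not part of this header): REF_AMPL is a hypothetical stand-in for
+// whatever reference amplitude the caller derives from the microphone's
+// sensitivity.
+/*
+    std::array<sos_t, 1024> buf;                      // filled with samples
+    float sum_sqr = A_weighting.filter_sum_sqr(buf);  // weighted sum of squares
+    float rms     = qfp_fsqrt(sum_sqr / buf.size());  // root mean square
+    float db      = 20.f * qfp_flog10(rms / REF_AMPL);
+*/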