From eea21d422b06445729f1daec764591d083584613 Mon Sep 17 00:00:00 2001 From: Clyne Sullivan Date: Sun, 2 Jun 2024 17:59:04 -0400 Subject: [PATCH] use qfplib; fix I2S clock --- Makefile | 15 +- cfg/mcuconf.h | 4 +- main.cpp | 51 +- qfplib-m0-full-20240105/LICENCE | 339 ++ qfplib-m0-full-20240105/README | 4 + qfplib-m0-full-20240105/qfplib-m0-full.h | 97 + qfplib-m0-full-20240105/qfplib-m0-full.s | 3670 ++++++++++++++++++++++ sos-iir-filter.h | 96 +- 8 files changed, 4221 insertions(+), 55 deletions(-) create mode 100644 qfplib-m0-full-20240105/LICENCE create mode 100644 qfplib-m0-full-20240105/README create mode 100644 qfplib-m0-full-20240105/qfplib-m0-full.h create mode 100644 qfplib-m0-full-20240105/qfplib-m0-full.s diff --git a/Makefile b/Makefile index 4e12bd1..1065763 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ # Compiler options here. ifeq ($(USE_OPT),) - USE_OPT = -O3 -ggdb -fomit-frame-pointer -falign-functions=16 + USE_OPT = -O3 -ggdb -fomit-frame-pointer -falign-functions=16 -fno-stack-protector endif # C specific options here (added to USE_OPT). @@ -25,7 +25,7 @@ endif # Linker extra options here. ifeq ($(USE_LDOPT),) - USE_LDOPT = -lm + USE_LDOPT = endif # Enable this if you want link time optimizations (LTO). @@ -66,12 +66,12 @@ endif # Enables the use of FPU (no, softfp, hard). ifeq ($(USE_FPU),) - USE_FPU = softfp + USE_FPU = no endif # FPU-related options. ifeq ($(USE_FPU_OPT),) - USE_FPU_OPT = -mfloat-abi=$(USE_FPU) -mfpu=fpv4-sp-d16 + USE_FPU_OPT = #-mfloat-abi=$(USE_FPU) -mfpu=fpv4-sp-d16 endif # @@ -89,7 +89,7 @@ PROJECT = ch MCU = cortex-m0 # Imported source files and paths. -CHIBIOS := ../.. +CHIBIOS := ../../ChibiOS CONFDIR := ./cfg BUILDDIR := ./build DEPDIR := ./.dep @@ -127,14 +127,15 @@ CPPSRC = $(ALLCPPSRC) \ main.cpp # List ASM source files here. -ASMSRC = $(ALLASMSRC) +ASMSRC = $(ALLASMSRC) \ + qfplib-m0-full-20240105/qfplib-m0-full.s # List ASM with preprocessor source files here. ASMXSRC = $(ALLXASMSRC) # Inclusion directories. INCDIR = $(CONFDIR) $(ALLINC) \ - fpm/include + qfplib-m0-full-20240105 # Define C warning options here. 
CWARN = -Wall -Wextra -Wundef -Wstrict-prototypes
diff --git a/cfg/mcuconf.h b/cfg/mcuconf.h
index 6e9c549..7b25ff4 100644
--- a/cfg/mcuconf.h
+++ b/cfg/mcuconf.h
@@ -67,7 +67,7 @@
 #define STM32_PLLQ_VALUE 4
 #define STM32_PLLR_VALUE 4
 #define STM32_HPRE STM32_HPRE_DIV1
-#define STM32_PPRE STM32_PPRE_DIV1
+#define STM32_PPRE STM32_PPRE_DIV4
 #define STM32_MCOSEL STM32_MCOSEL_NOCLOCK
 #define STM32_MCOPRE STM32_MCOPRE_DIV1
 #define STM32_LSCOSEL STM32_LSCOSEL_NOCLOCK
@@ -79,7 +79,7 @@
 #define STM32_USART2SEL STM32_USART2SEL_PCLK
 #define STM32_LPUART1SEL STM32_LPUART1SEL_PCLK
 #define STM32_I2C1SEL STM32_I2C1SEL_PCLK
-#define STM32_I2S1SEL STM32_I2S1SEL_HSI16
+#define STM32_I2S1SEL STM32_I2S1SEL_SYSCLK
 #define STM32_LPTIM1SEL STM32_LPTIM1SEL_PCLK
 #define STM32_TIM1SEL STM32_TIM1SEL_TIMPCLK
 #define STM32_RNGSEL STM32_RNGSEL_HSI16
diff --git a/main.cpp b/main.cpp
index 5151e33..2db9ec8 100644
--- a/main.cpp
+++ b/main.cpp
@@ -9,31 +9,31 @@
 static constexpr auto& WEIGHTING = A_weighting;
 static constexpr auto& MIC_EQUALIZER = SPH0645LM4H_B_RB;
-static constexpr sos_t MIC_OFFSET_DB ( 0.f); // Linear offset
-static constexpr sos_t MIC_SENSITIVITY (-26.f); // dBFS value expected at MIC_REF_DB
-static constexpr sos_t MIC_REF_DB ( 94.f); // dB where sensitivity is specified
+static constexpr float MIC_OFFSET_DB ( 0.f); // Linear offset
+static constexpr float MIC_SENSITIVITY (-26.f); // dBFS value expected at MIC_REF_DB
+static constexpr float MIC_REF_DB ( 94.f); // dB where sensitivity is specified
 static constexpr sos_t MIC_OVERLOAD_DB (120.f); // dB - Acoustic overload point
 static constexpr sos_t MIC_NOISE_DB ( 29.f); // dB - Noise floor
-static constexpr auto MIC_BITS = 16u; // SPH0645 is actually 18 bits
-static constexpr auto SAMPLE_RATE = 32000u;
+static constexpr auto MIC_BITS = 18u;
+static constexpr auto SAMPLE_RATE = 48000u;
-static constexpr unsigned I2S_BUFSIZ = 512; // was 1024
-static constexpr unsigned I2S_STRIDE = 16; // was 8
+static constexpr unsigned I2S_BUFSIZ = 1024;
+static constexpr unsigned I2S_STRIDE = 16;
 // Calculate reference amplitude value at compile time
-static const auto MIC_REF_AMPL = fpm::pow(sos_t(10), MIC_SENSITIVITY / 20) *
+static const auto MIC_REF_AMPL = qfp_fpow(10.f, MIC_SENSITIVITY / 20) *
 ((1 << (MIC_BITS - 1)) - 1);
 static SEMAPHORE_DECL(i2sReady, 0);
 static THD_WORKING_AREA(waThread1, 128);
 static std::array<uint32_t, I2S_BUFSIZ> i2sBuffer;
-static sos_t Leq_sum_sqr (0);
+static sos_t Leq_sum_sqr (0.f);
 static unsigned Leq_samples = 0;
 static THD_FUNCTION(Thread1, arg);
 static void i2sCallback(I2SDriver *i2s);
-static constexpr auto I2SPRval = SAMPLE_RATE * 64 / 1000 / 1024 * 2;
+static constexpr unsigned I2SPRval = 16'000'000 / SAMPLE_RATE / 32 / 2;
 static constexpr I2SConfig i2sConfig = {
 /* TX buffer */ NULL,
 /* RX buffer */ i2sBuffer.data(),
@@ -43,11 +43,11 @@ static constexpr I2SConfig i2sConfig = {
 (0 << SPI_I2SCFGR_I2SSTD_Pos) | // Philips I2S
 (1 << SPI_I2SCFGR_DATLEN_Pos) | // 24-bit
 SPI_I2SCFGR_CHLEN, // 32-bit frame
- /* I2SPR */ I2SPRval | (((I2SPRval / 2) & 1) ? SPI_I2SPR_ODD : 0)
+ /* I2SPR */ (I2SPRval / 2) | ((I2SPRval & 1) ? SPI_I2SPR_ODD : 0)
 };
 THD_TABLE_BEGIN
- THD_TABLE_THREAD(0, "blinker1", waThread1, Thread1, NULL)
+ THD_TABLE_THREAD(0, "main", waThread1, Thread1, NULL)
 THD_TABLE_END
 int main(void)
@@ -64,6 +64,7 @@ THD_FUNCTION(Thread1, arg)
 chThdSleepMilliseconds(2000);
+ palSetPadMode(GPIOB, 7, PAL_MODE_OUTPUT_PUSHPULL);
 palSetPadMode(GPIOF, 2, PAL_MODE_UNCONNECTED);
 palSetLineMode(LINE_I2S_SD, PAL_MODE_ALTERNATE(0));
 palSetLineMode(LINE_I2S_WS, PAL_MODE_ALTERNATE(0));
@@ -79,12 +80,13 @@ THD_FUNCTION(Thread1, arg)
 uint8_t strbuf[7] = { 0, 0, 0, 'd', 'B', '\n', '\0' };
 for (;;) {
+ palSetPad(GPIOB, 7);
 chSemWait(&i2sReady);
- const auto Leq_RMS = fpm::sqrt(Leq_sum_sqr / Leq_samples);
+ const auto Leq_RMS = qfp_fsqrt((float)Leq_sum_sqr / Leq_samples);
 const auto Leq_dB = MIC_OFFSET_DB + MIC_REF_DB + 20 *
- fpm::log10(Leq_RMS / MIC_REF_AMPL);
- Leq_sum_sqr = sos_t(0);
+ qfp_flog10(Leq_RMS / MIC_REF_AMPL);
+ Leq_sum_sqr = sos_t(0.f);
 Leq_samples = 0;
 auto n = std::clamp(static_cast<long>(Leq_dB), 0l, 999l);
@@ -92,22 +94,24 @@
 strbuf[1] = n % 10 + '0'; n /= 10;
 strbuf[0] = n ? n + '0' : ' ';
 sdWrite(&SD2, strbuf, sizeof(strbuf));
+ palClearPad(GPIOB, 7);
 }
 }
+int32_t fixsample(uint32_t s) {
+ return (int32_t)(((s & 0xFFFF) << 16) | (s >> 16)) >> (32 - MIC_BITS);
+}
+
 void i2sCallback(I2SDriver *i2s)
 {
- // Note: samples come in as 16-bit big-endian, so for simplicity we just
- // take the "high" 16-bits of valid data and drop the rest. Mic is
- // technically 18-bit, so we are losing some precision.
-
+ palSetPad(GPIOB, 7);
 const auto halfsize = i2sBuffer.size() / 2;
 const auto offset = i2sIsBufferComplete(i2s) ? halfsize : 0;
 auto samples = reinterpret_cast<sos_t *>(i2sBuffer.data() + offset);
 std::ranges::copy(
- std::views::counted(i2sBuffer.begin() + offset, halfsize)
- | std::ranges::views::stride(2 * I2S_STRIDE)
- | std::views::transform([](auto s) { return sos_t((int16_t)s); }),
+ std::views::counted(i2sBuffer.begin() + offset, halfsize / I2S_STRIDE)
+ | std::ranges::views::stride(2)
+ | std::views::transform([](uint32_t s) { return sos_t(fixsample(s)); }),
 samples);
 auto samps = std::views::counted(samples, halfsize / (2 * I2S_STRIDE));
@@ -117,8 +121,9 @@ void i2sCallback(I2SDriver *i2s)
 Leq_samples += samps.size();
 // Wakeup main thread for dB calculation every second
- if (Leq_samples >= 2 * SAMPLE_RATE / I2S_STRIDE) {
+ if (Leq_samples >= SAMPLE_RATE / I2S_STRIDE) {
 chSemSignalI(&i2sReady);
 }
+ palClearPad(GPIOB, 7);
 }
diff --git a/qfplib-m0-full-20240105/LICENCE b/qfplib-m0-full-20240105/LICENCE
new file mode 100644
index 0000000..d511905
--- /dev/null
+++ b/qfplib-m0-full-20240105/LICENCE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+ + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. 
+ +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
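A note on two details of the main.cpp changes above. First, the I2S clock fix: each stereo
frame is 2 channels of 32 bit-clocks, so with the 16 MHz source clock hard-coded in the new
expression, I2SPRval = 16'000'000 / 48'000 / 32 / 2 = 5 (integer division). The new I2SPR
packing turns this into I2SDIV = 2 with the ODD bit set, giving an actual sample rate of
16 MHz / (64 * (2*I2SDIV + ODD)) = 50 kHz, the closest this divider can get to the nominal
48 kHz SAMPLE_RATE. Second, main.cpp calls qfp_fpow() and qfp_flog10(), which
qfplib-m0-full.h (below) does not declare; they presumably come from elsewhere in the
project (the sos-iir-filter.h part of this patch is not shown in this excerpt). A minimal
sketch of what such wrappers could look like, using only functions the header does declare;
the wrapper names and the ln(10) constant here are assumptions for illustration, not part
of the patch:

    #include "qfplib-m0-full.h"

    /* x^y = e^(y ln x), valid for x > 0; covers qfp_fpow(10.f, ...) as used above */
    static inline float qfp_fpow(float x, float y)
    {
        return qfp_fexp(qfp_fmul(y, qfp_fln(x)));
    }

    /* log10(x) = ln(x) / ln(10), with ln(10) ~= 2.302585093 */
    static inline float qfp_flog10(float x)
    {
        return qfp_fdiv(qfp_fln(x), 2.302585093f);
    }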
diff --git a/qfplib-m0-full-20240105/README b/qfplib-m0-full-20240105/README
new file mode 100644
index 0000000..99e878a
--- /dev/null
+++ b/qfplib-m0-full-20240105/README
@@ -0,0 +1,4 @@
+The Qfplib libraries are open source, licensed under version 2 of the
+GNU GPL. A copy of that licence is included in this archive.
+
+Visit http://www.quinapalus.com/qfplib.html for more information.
diff --git a/qfplib-m0-full-20240105/qfplib-m0-full.h b/qfplib-m0-full-20240105/qfplib-m0-full.h
new file mode 100644
index 0000000..36d17d2
--- /dev/null
+++ b/qfplib-m0-full-20240105/qfplib-m0-full.h
@@ -0,0 +1,97 @@
+#ifndef __QFPLIB_M0_FULL_H__
+#define __QFPLIB_M0_FULL_H__
+
+/*
+Copyright 2019-2024 Mark Owen
+http://www.quinapalus.com
+E-mail: qfp@quinapalus.com
+
+This file is free software: you can redistribute it and/or modify
+it under the terms of version 2 of the GNU General Public License
+as published by the Free Software Foundation.
+
+This file is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this file. If not, see <http://www.gnu.org/licenses/> or
+write to the Free Software Foundation, Inc., 51 Franklin Street,
+Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+typedef unsigned int ui32;
+typedef int i32;
+typedef unsigned long long int ui64;
+typedef long long int i64;
+
+extern float qfp_fadd (float x,float y);
+extern float qfp_fsub (float x,float y);
+extern float qfp_fmul (float x,float y);
+extern float qfp_fdiv (float x,float y);
+extern int qfp_fcmp (float x,float y);
+extern float qfp_fsqrt (float x);
+extern i32 qfp_float2int (float x);
+extern i32 qfp_float2fix (float x,int f);
+extern ui32 qfp_float2uint (float x);
+extern ui32 qfp_float2ufix (float x,int f);
+extern float qfp_int2float (i32 x);
+extern float qfp_fix2float (i32 x,int f);
+extern float qfp_uint2float (ui32 x);
+extern float qfp_ufix2float (ui32 x,int f);
+extern float qfp_int642float (i64 x);
+extern float qfp_fix642float (i64 x,int f);
+extern float qfp_uint642float (ui64 x);
+extern float qfp_ufix642float (ui64 x,int f);
+extern float qfp_fcos (float x);
+extern float qfp_fsin (float x);
+extern float qfp_ftan (float x);
+extern float qfp_fatan2 (float y,float x);
+extern float qfp_fexp (float x);
+extern float qfp_fln (float x);
+
+extern double qfp_dadd (double x,double y);
+extern double qfp_dsub (double x,double y);
+extern double qfp_dmul (double x,double y);
+extern double qfp_ddiv (double x,double y);
+extern double qfp_dsqrt (double x);
+extern double qfp_dcos (double x);
+extern double qfp_dsin (double x);
+extern double qfp_dtan (double x);
+extern double qfp_datan2 (double y,double x);
+extern double qfp_dexp (double x);
+extern double qfp_dln (double x);
+extern int qfp_dcmp (double x,double y);
+
+extern i64 qfp_float2int64 (float x);
+extern i64 qfp_float2fix64 (float x,int f);
+extern ui64 qfp_float2uint64 (float x);
+extern ui64 qfp_float2ufix64 (float x,int f);
+extern i32 qfp_float2int_z (float x);
+extern i64 qfp_float2int64_z (float x);
+
+extern i32 qfp_double2int (double x);
+extern i32 qfp_double2fix (double x,int f);
+extern ui32 qfp_double2uint (double x);
+extern ui32 qfp_double2ufix (double x,int f);
+extern i64 qfp_double2int64 (double x);
+extern i64 qfp_double2fix64 (double x,int f);
+extern ui64 qfp_double2uint64 (double x);
+extern ui64 qfp_double2ufix64 (double x,int f);
+extern i32 qfp_double2int_z (double x);
+extern i64 qfp_double2int64_z(double x);
+
+extern double qfp_int2double (i32 x);
+extern double qfp_fix2double (i32 x,int f);
+extern double qfp_uint2double (ui32 x);
+extern double qfp_ufix2double (ui32 x,int f);
+extern double qfp_int642double (i64 x);
+extern double qfp_fix642double (i64 x,int f);
+extern double qfp_uint642double (ui64 x);
+extern double qfp_ufix642double (ui64 x,int f);
+
+extern float qfp_double2float (double x);
+extern double qfp_float2double (float x);
+
+#endif
diff --git a/qfplib-m0-full-20240105/qfplib-m0-full.s b/qfplib-m0-full-20240105/qfplib-m0-full.s
new file mode 100644
index 0000000..0a354f1
--- /dev/null
+++ b/qfplib-m0-full-20240105/qfplib-m0-full.s
@@ -0,0 +1,3670 @@
+@ Copyright 2019-2024 Mark Owen
+@ http://www.quinapalus.com
+@ E-mail: qfp@quinapalus.com
+@
+@ This file is free software: you can redistribute it and/or modify
+@ it under the terms of version 2 of the GNU General Public License
+@ as published by the Free Software Foundation.
+@
+@ This file is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+@ GNU General Public License for more details.
+@
+@ You should have received a copy of the GNU General Public License
+@ along with this file. If not, see <http://www.gnu.org/licenses/> or
+@ write to the Free Software Foundation, Inc., 51 Franklin Street,
+@ Fifth Floor, Boston, MA 02110-1301, USA.
+
+.syntax unified
+.cpu cortex-m0plus
+.thumb
+
+@ exported symbols
+
+.global qfp_fadd
+.global qfp_fsub
+.global qfp_fmul
+.global qfp_fdiv
+.global qfp_fcmp
+.global qfp_fsqrt
+.global qfp_float2int
+.global qfp_float2fix
+.global qfp_float2uint
+.global qfp_float2ufix
+.global qfp_int2float
+.global qfp_fix2float
+.global qfp_uint2float
+.global qfp_ufix2float
+.global qfp_int642float
+.global qfp_fix642float
+.global qfp_uint642float
+.global qfp_ufix642float
+.global qfp_fcos
+.global qfp_fsin
+.global qfp_ftan
+.global qfp_fatan2
+.global qfp_fexp
+.global qfp_fln
+
+.global qfp_dadd
+.global qfp_dsub
+.global qfp_dmul
+.global qfp_ddiv
+.global qfp_dsqrt
+.global qfp_dcos
+.global qfp_dsin
+.global qfp_dtan
+.global qfp_datan2
+.global qfp_dexp
+.global qfp_dln
+.global qfp_dcmp
+
+.global qfp_float2int64
+.global qfp_float2fix64
+.global qfp_float2uint64
+.global qfp_float2ufix64
+.global qfp_float2int_z
+.global qfp_float2int64_z
+
+.global qfp_double2int
+.global qfp_double2fix
+.global qfp_double2uint
+.global qfp_double2ufix
+.global qfp_double2int64
+.global qfp_double2fix64
+.global qfp_double2uint64
+.global qfp_double2ufix64
+.global qfp_double2int_z
+.global qfp_double2int64_z
+
+.global qfp_int2double
+.global qfp_fix2double
+.global qfp_uint2double
+.global qfp_ufix2double
+.global qfp_int642double
+.global qfp_fix642double
+.global qfp_uint642double
+.global qfp_ufix642double
+
+.global qfp_double2float
+.global qfp_float2double
+
+qfp_lib_start:
+
+@ exchange r0<->r1, r2<->r3
+xchxy:
+ push {r0,r2,r14}
+ mov r0,r1
+ mov r2,r3
+ pop {r1,r3,r15}
+
+@ IEEE single in r0-> signed (two's complement) mantissa in r0 9Q23 (24 significant bits), signed exponent (bias removed) in r2
+@ trashes r4; zero, denormal -> mantissa=+/-1, exponent=-380; Inf, NaN -> mantissa=+/-1, exponent=+640
+unpackx:
+ lsrs r2,r0,#23 @ save exponent and sign
+ lsls r0,#9 @ extract mantissa
+ lsrs r0,#9
+ movs r4,#1
+ lsls r4,#23
+ orrs r0,r4 @ reinstate implied leading 1
+ cmp r2,#255 @ test sign bit
uxtb r2,r2 @ clear it + bls 1f @ branch on positive + rsbs r0,#0 @ negate mantissa +1: + subs r2,#1 + cmp r2,#254 @ zero/denormal/Inf/NaN? + bhs 2f + subs r2,#126 @ remove exponent bias: can now be -126..+127 + bx r14 + +2: @ here with special-case values + cmp r0,#0 + mov r0,r4 @ set mantissa to +1 + bpl 3f + rsbs r0,#0 @ zero/denormal/Inf/NaN: mantissa=+/-1 +3: + subs r2,#126 @ zero/denormal: exponent -> -127; Inf, NaN: exponent -> 128 + lsls r2,#2 @ zero/denormal: exponent -> -508; Inf, NaN: exponent -> 512 + adds r2,#128 @ zero/denormal: exponent -> -380; Inf, NaN: exponent -> 640 + bx r14 + +@ normalise and pack signed mantissa in r0 nominally 3Q29, signed exponent in r2-> IEEE single in r0 +@ trashes r4, preserves r1,r3 +@ r5: "sticky bits", must be zero iff all result bits below r0 are zero for correct rounding +packx: + lsrs r4,r0,#31 @ save sign bit + lsls r4,r4,#31 @ sign now in b31 + bpl 2f @ skip if positive + cmp r5,#0 + beq 11f + adds r0,#1 @ fiddle carry in to following rsb if sticky bits are non-zero +11: + rsbs r0,#0 @ can now treat r0 as unsigned +packx0: + bmi 3f @ catch r0=0x80000000 case +2: + subs r2,#1 @ normalisation loop + adds r0,r0 + beq 1f @ zero? special case + bpl 2b @ normalise so leading "1" in bit 31 +3: + adds r2,#129 @ (mis-)offset exponent + bne 12f @ special case: highest denormal can round to lowest normal + adds r0,#0x80 @ in special case, need to add 256 to r0 for rounding + bcs 4f @ tripped carry? then have leading 1 in C as required +12: + adds r0,#0x80 @ rounding + bcs 4f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits) + cmp r5,#0 + beq 7f @ sticky bits zero? +8: + lsls r0,#1 @ remove leading 1 +9: + subs r2,#1 @ compensate exponent on this path +4: + cmp r2,#254 + bge 5f @ overflow? + adds r2,#1 @ correct exponent offset + ble 10f @ denormal/underflow? + lsrs r0,#9 @ align mantissa + lsls r2,#23 @ align exponent + orrs r0,r2 @ assemble exponent and mantissa +6: + orrs r0,r4 @ apply sign +1: + bx r14 + +5: + movs r0,#0xff @ create infinity + lsls r0,#23 + b 6b + +10: + movs r0,#0 @ create zero + bx r14 + +7: @ sticky bit rounding case + lsls r5,r0,#24 @ check bottom 8 bits of r0 + bne 8b @ in rounding-tie case? + lsrs r0,#9 @ ensure even result + lsls r0,#10 + b 9b + +.align 2 +.ltorg + +@ signed multiply r0 1Q23 by r1 4Q23, result in r0 7Q25, sticky bits in r5 +@ trashes r3,r4 +mul0: + uxth r3,r0 @ Q23 + asrs r4,r1,#16 @ Q7 + muls r3,r4 @ L*H, Q30 signed + asrs r4,r0,#16 @ Q7 + uxth r5,r1 @ Q23 + muls r4,r5 @ H*L, Q30 signed + adds r3,r4 @ sum of middle partial products + uxth r4,r0 + muls r4,r5 @ L*L, Q46 unsigned + lsls r5,r4,#16 @ initialise sticky bits from low half of low partial product + lsrs r4,#16 @ Q25 + adds r3,r4 @ add high half of low partial product to sum of middle partial products + @ (cannot generate carry by limits on input arguments) + asrs r0,#16 @ Q7 + asrs r1,#16 @ Q7 + muls r0,r1 @ H*H, Q14 signed + lsls r0,#11 @ high partial product Q25 + lsls r1,r3,#27 @ sticky + orrs r5,r1 @ collect further sticky bits + asrs r1,r3,#5 @ middle partial products Q25 + adds r0,r1 @ final result + bx r14 + +.thumb_func +qfp_fcmp: + lsls r2,r0,#1 + lsrs r2,#24 + beq 1f + cmp r2,#0xff + bne 2f +1: + lsrs r0,#23 @ clear mantissa if NaN or denormal + lsls r0,#23 +2: + lsls r2,r1,#1 + lsrs r2,#24 + beq 1f + cmp r2,#0xff + bne 2f +1: + lsrs r1,#23 @ clear mantissa if NaN or denormal + lsls r1,#23 +2: + movs r2,#1 @ initialise result + eors r1,r0 + bmi 4f @ opposite signs? 
then can proceed on basis of sign of x + eors r1,r0 @ restore y + bpl 1f + rsbs r2,#0 @ both negative? flip comparison +1: + cmp r0,r1 + bgt 2f + blt 3f +5: + movs r2,#0 +3: + rsbs r2,#0 +2: + subs r0,r2,#0 + bx r14 +4: + orrs r1,r0 + adds r1,r1 + beq 5b + cmp r0,#0 + bge 2b + b 3b + +@ convert float to signed int, rounding towards 0, clamping +.thumb_func +qfp_float2int_z: + push {r14} + cmp r0,#0 + blt 1f + bl qfp_float2int @ +ve or zero? then use rounding towards -Inf + cmp r0,#0 + bge 2f + ldr r0,=#0x7fffffff +2: + pop {r15} +1: + lsls r0,#1 @ -ve: clear sign bit + lsrs r0,#1 + bl qfp_float2uint @ convert to unsigned, rounding towards -Inf + cmp r0,#0 + blt 1f + rsbs r0,#0 + pop {r15} +1: + movs r0,#1 + lsls r0,#31 @ 0x80000000 + pop {r15} + +.ltorg + +@ convert float to signed int, rounding towards -Inf, clamping +.thumb_func +qfp_float2int: + movs r1,#0 @ fall through + +@ convert float in r0 to signed fixed point in r0, clamping +.thumb_func +qfp_float2fix: + push {r4,r14} + bl unpackx + movs r3,r2 + adds r3,#130 + bmi 6f @ -0? + add r2,r1 @ incorporate binary point position into exponent + subs r2,#23 @ r2 is now amount of left shift required + blt 1f @ requires right shift? + cmp r2,#7 @ overflow? + ble 4f +3: @ overflow + asrs r1,r0,#31 @ +ve:0 -ve:0xffffffff + mvns r1,r1 @ +ve:0xffffffff -ve:0 + movs r0,#1 + lsls r0,#31 +5: + eors r0,r1 @ +ve:0x7fffffff -ve:0x80000000 (unsigned path: 0xffffffff) + pop {r4,r15} +1: + rsbs r2,#0 @ right shift for r0, >0 + cmp r2,#32 + blt 2f @ more than 32 bits of right shift? + movs r2,#32 +2: + asrs r0,r0,r2 + pop {r4,r15} +6: + movs r0,#0 + pop {r4,r15} + +@ unsigned version +.thumb_func +qfp_float2uint: + movs r1,#0 @ fall through +.thumb_func +qfp_float2ufix: + push {r4,r14} + bl unpackx + add r2,r1 @ incorporate binary point position into exponent + movs r1,r0 + bmi 5b @ negative? return zero + subs r2,#23 @ r2 is now amount of left shift required + blt 1b @ requires right shift? + mvns r1,r0 @ ready to return 0xffffffff + cmp r2,#8 @ overflow? + bgt 5b +4: + lsls r0,r0,r2 @ result fits, left shifted + pop {r4,r15} + + +@ convert uint64 to float, rounding +.thumb_func +qfp_uint642float: + movs r2,#0 @ fall through + +@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2 +.thumb_func +qfp_ufix642float: + push {r4,r5,r14} + cmp r1,#0 + bpl 3f @ positive? we can use signed code + lsls r5,r1,#31 @ contribution to sticky bits + orrs r5,r0 + lsrs r0,r1,#1 + subs r2,#1 + b 4f + +@ convert int64 to float, rounding +.thumb_func +qfp_int642float: + movs r2,#0 @ fall through + +@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2 +.thumb_func +qfp_fix642float: + push {r4,r5,r14} +3: + movs r5,r0 + orrs r5,r1 + beq ret_pop45 @ zero? return +0 + asrs r5,r1,#31 @ sign bits +2: + asrs r4,r1,#24 @ try shifting 7 bits at a time + cmp r4,r5 + bne 1f @ next shift will overflow? 
+ lsls r1,#7 + lsrs r4,r0,#25 + orrs r1,r4 + lsls r0,#7 + adds r2,#7 + b 2b +1: + movs r5,r0 + movs r0,r1 +4: + rsbs r2,#0 + adds r2,#32+29 + b packret + +@ convert signed int to float, rounding +.thumb_func +qfp_int2float: + movs r1,#0 @ fall through + +@ convert signed fix to float, rounding; number of r0 bits after point in r1 +.thumb_func +qfp_fix2float: + push {r4,r5,r14} +1: + movs r2,#29 + subs r2,r1 @ fix exponent +packretns: @ pack and return, sticky bits=0 + movs r5,#0 +packret: @ common return point: "pack and return" + bl packx +ret_pop45: + pop {r4,r5,r15} + + +@ unsigned version +.thumb_func +qfp_uint2float: + movs r1,#0 @ fall through +.thumb_func +qfp_ufix2float: + push {r4,r5,r14} + cmp r0,#0 + bge 1b @ treat <2^31 as signed + movs r2,#30 + subs r2,r1 @ fix exponent + lsls r5,r0,#31 @ one sticky bit + lsrs r0,#1 + b packret + +@ All the scientific functions are implemented using the CORDIC algorithm. For notation, +@ details not explained in the comments below, and a good overall survey see +@ "50 Years of CORDIC: Algorithms, Architectures, and Applications" by Meher et al., +@ IEEE Transactions on Circuits and Systems Part I, Volume 56 Issue 9. + +@ Register use: +@ r0: x +@ r1: y +@ r2: z/omega +@ r3: coefficient pointer +@ r4,r12: m +@ r5: i (shift) + +cordic_start: @ initialisation + movs r5,#0 @ initial shift=0 + mov r12,r4 + b 5f + +cordic_vstep: @ one step of algorithm in vector mode + cmp r1,#0 @ check sign of y + bgt 4f + b 1f +cordic_rstep: @ one step of algorithm in rotation mode + cmp r2,#0 @ check sign of angle + bge 1f +4: + subs r1,r6 @ negative rotation: y=y-(x>>i) + rsbs r7,#0 + adds r2,r4 @ accumulate angle + b 2f +1: + adds r1,r6 @ positive rotation: y=y+(x>>i) + subs r2,r4 @ accumulate angle +2: + mov r4,r12 + muls r7,r4 @ apply sign from m + subs r0,r7 @ finish rotation: x=x{+/-}(y>>i) +5: + ldmia r3!,{r4} @ fetch next angle from table and bump pointer + lsrs r4,#1 @ repeated angle? 
+ bcs 3f + adds r5,#1 @ adjust shift if not +3: + mov r6,r0 + asrs r6,r5 @ x>>i + mov r7,r1 + asrs r7,r5 @ y>>i + lsrs r4,#1 @ shift end flag into carry + bx r14 + +@ CORDIC rotation mode +cordic_rot: + push {r6,r7,r14} + bl cordic_start @ initialise +1: + bl cordic_rstep + bcc 1b @ step until table finished + asrs r6,r0,#14 @ remaining small rotations can be linearised: see IV.B of paper referenced above + asrs r7,r1,#14 + asrs r2,#3 + muls r6,r2 @ all remaining CORDIC steps in a multiplication + muls r7,r2 + mov r4,r12 + muls r7,r4 + asrs r6,#12 + asrs r7,#12 + subs r0,r7 @ x=x{+/-}(yz>>k) + adds r1,r6 @ y=y+(xz>>k) +cordic_exit: + pop {r6,r7,r15} + +@ CORDIC vector mode +cordic_vec: + push {r6,r7,r14} + bl cordic_start @ initialise +1: + bl cordic_vstep + bcc 1b @ step until table finished +4: + cmp r1,#0 @ continue as in cordic_vstep but without using table; x is not affected as y is small + bgt 2f @ check sign of y + adds r1,r6 @ positive rotation: y=y+(x>>i) + subs r2,r4 @ accumulate angle + b 3f +2: + subs r1,r6 @ negative rotation: y=y-(x>>i) + adds r2,r4 @ accumulate angle +3: + asrs r6,#1 + asrs r4,#1 @ next "table entry" + bne 4b + b cordic_exit + +.thumb_func +qfp_fsin: @ calculate sin and cos using CORDIC rotation method + push {r4,r5,r14} + movs r1,#24 + bl qfp_float2fix @ range reduction by repeated subtraction/addition in fixed point + ldr r4,pi_q29 + lsrs r4,#4 @ 2pi Q24 +1: + subs r0,r4 + bge 1b +1: + adds r0,r4 + bmi 1b @ now in range 0..2pi + lsls r2,r0,#2 @ z Q26 + lsls r5,r4,#1 @ pi Q26 (r4=pi/2 Q26) + ldr r0,=#0x136e9db4 @ initialise CORDIC x,y with scaling + movs r1,#0 +1: + cmp r2,r4 @ >pi/2? + blt 2f + subs r2,r5 @ reduce range to -pi/2..pi/2 + rsbs r0,#0 @ rotate vector by pi + b 1b +2: + lsls r2,#3 @ Q29 + adr r3,tab_cc @ circular coefficients + movs r4,#1 @ m=1 + bl cordic_rot + adds r1,#9 @ fiddle factor to make sin(0)==0 + movs r2,#0 @ exponents to zero + movs r3,#0 + movs r5,#0 @ no sticky bits + bl clampx + bl packx @ pack cosine + bl xchxy + bl clampx + b packretns @ pack sine + +.thumb_func +qfp_fcos: + push {r14} + bl qfp_fsin + mov r0,r1 @ extract cosine result + pop {r15} + +@ force r0 to lie in range [-1,1] Q29 +clampx: + movs r4,#1 + lsls r4,#29 + cmp r0,r4 + bgt 1f + rsbs r4,#0 + cmp r0,r4 + ble 1f + bx r14 +1: + movs r0,r4 + bx r14 + +.thumb_func +qfp_ftan: + push {r4,r5,r6,r14} + bl qfp_fsin @ sine in r0/r2, cosine in r1/r3 + b fdiv_n @ sin/cos + +.thumb_func +qfp_fexp: + push {r4,r5,r14} + movs r1,#24 + bl qfp_float2fix @ Q24: covers entire valid input range + asrs r1,r0,#16 @ Q8 + ldr r2,=#5909 @ log_2(e) Q12 + muls r2,r1 @ estimate exponent of result Q20 (always an underestimate) + asrs r2,#20 @ Q0 + lsls r1,r0,#6 @ Q30 + ldr r0,=#0x2c5c85fe @ ln(2) Q30 + muls r0,r2 @ accurate contribution of estimated exponent + subs r1,r0 @ residual to be exponentiated, guaranteed ≥0, < about 0.75 Q30 + +@ here +@ r1: mantissa to exponentiate, 0...~0.75 Q30 +@ r2: first exponent estimate + + movs r5,#1 @ shift + adr r3,ftab_exp @ could use alternate words from dtab_exp to save space if required + movs r0,#1 + lsls r0,#29 @ x=1 Q29 + +3: + ldmia r3!,{r4} + subs r4,r1,r4 + bmi 1f + movs r1,r4 @ keep result of subtraction + movs r4,r0 + lsrs r4,r5 + adcs r0,r4 @ x+=x>>i with rounding + +1: + adds r5,#1 + cmp r5,#15 + bne 3b + +@ here +@ r0: exp a Q29 1..2+ +@ r1: ε (residual x where x=a+ε), < 2^-14 Q30 +@ r2: first exponent estimate +@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε) + + lsrs r3,r0,#15 @ exp a Q14 + muls r3,r1 @ ε exp a Q44 + lsrs r3,#15 
@ ε exp a Q29 + adcs r0,r3 @ (1+ε) exp a Q29 with rounding + + b packretns @ pack result + +.thumb_func +qfp_fln: + push {r4,r5,r14} + asrs r1,r0,#23 + bmi 3f @ -ve argument? + beq 3f @ 0 argument? + cmp r1,#0xff + beq 4f @ +Inf/NaN + bl unpackx + adds r2,#1 + ldr r3,=#0x2c5c85fe @ ln(2) Q30 + lsrs r1,r3,#14 @ ln(2) Q16 + muls r1,r2 @ result estimate Q16 + asrs r1,#16 @ integer contribution to result + muls r3,r2 + lsls r4,r1,#30 + subs r3,r4 @ fractional contribution to result Q30, signed + lsls r0,#8 @ Q31 + +@ here +@ r0: mantissa Q31 +@ r1: integer contribution to result +@ r3: fractional contribution to result Q30, signed + + movs r5,#1 @ shift + adr r4,ftab_exp @ could use alternate words from dtab_exp to save space if required + +2: + movs r2,r0 + lsrs r2,r5 + adcs r2,r0 @ x+(x>>i) with rounding + bcs 1f @ >=2? + movs r0,r2 @ keep result + ldr r2,[r4] + subs r3,r2 +1: + adds r4,#4 + adds r5,#1 + cmp r5,#15 + bne 2b + +@ here +@ r0: residual x, nearly 2 Q31 +@ r1: integer contribution to result +@ r3: fractional part of result Q30 + + asrs r0,#2 + adds r0,r3,r0 + + cmp r1,#0 + bne 2f + + asrs r0,#1 + lsls r1,#29 + adds r0,r1 + movs r2,#0 + b packretns + +2: + lsls r1,#24 + asrs r0,#6 @ Q24 + adcs r0,r1 @ with rounding + movs r2,#5 + b packretns + +3: + ldr r0,=#0xff800000 @ -Inf + pop {r4,r5,r15} +4: + ldr r0,=#0x7f800000 @ +Inf + pop {r4,r5,r15} + +.align 2 +ftab_exp: +.word 0x19f323ed @ log 1+2^-1 Q30 +.word 0x0e47fbe4 @ log 1+2^-2 Q30 +.word 0x0789c1dc @ log 1+2^-3 Q30 +.word 0x03e14618 @ log 1+2^-4 Q30 +.word 0x01f829b1 @ log 1+2^-5 Q30 +.word 0x00fe0546 @ log 1+2^-6 Q30 +.word 0x007f80aa @ log 1+2^-7 Q30 +.word 0x003fe015 @ log 1+2^-8 Q30 +.word 0x001ff803 @ log 1+2^-9 Q30 +.word 0x000ffe00 @ log 1+2^-10 Q30 +.word 0x0007ff80 @ log 1+2^-11 Q30 +.word 0x0003ffe0 @ log 1+2^-12 Q30 +.word 0x0001fff8 @ log 1+2^-13 Q30 +.word 0x0000fffe @ log 1+2^-14 Q30 + +.thumb_func +qfp_fatan2: + push {r4,r5,r14} + +@ unpack arguments and shift one down to have common exponent + bl unpackx + bl xchxy + bl unpackx + lsls r0,r0,#5 @ Q28 + lsls r1,r1,#5 @ Q28 + adds r4,r2,r3 @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise + asrs r4,#9 + adds r4,#1 + bmi 2f @ force y to 0 proper, so result will be zero + subs r4,r2,r3 @ calculate shift + bge 1f @ ex>=ey? + rsbs r4,#0 @ make shift positive + asrs r0,r4 + cmp r4,#28 + blo 3f + asrs r0,#31 + b 3f +1: + asrs r1,r4 + cmp r4,#28 + blo 3f +2: +@ here |x|>>|y| or both x and y are ±0 + cmp r0,#0 + bge 4f @ x positive, return signed 0 + ldr r0,pi_q29 @ x negative, return +/- pi + asrs r1,#31 + eors r0,r1 + b 7f +4: + asrs r0,r1,#31 + b 7f +3: + movs r2,#0 @ initial angle + cmp r0,#0 @ x negative + bge 5f + rsbs r0,#0 @ rotate to 1st/4th quadrants + rsbs r1,#0 + ldr r2,pi_q29 @ pi Q29 +5: + adr r3,tab_cc @ circular coefficients + movs r4,#1 @ m=1 + bl cordic_vec @ also produces magnitude (with scaling factor 1.646760119), which is discarded + mov r0,r2 @ result here is -pi/2..3pi/2 Q29 +@ asrs r2,#29 +@ subs r0,r2 + ldr r2,pi_q29 @ pi Q29 + adds r4,r0,r2 @ attempt to fix -3pi/2..-pi case + bcs 6f @ -pi/2..0? 
leave result as is + subs r4,r0,r2 @ pi: take off 2pi +6: + subs r0,#1 @ fiddle factor so atan2(0,1)==0 +7: + movs r2,#0 @ exponent for pack + b packretns + +.align 2 +.ltorg + +@ first entry in following table is pi Q29 +pi_q29: +@ circular CORDIC coefficients: atan(2^-i), b0=flag for preventing shift, b1=flag for end of table +tab_cc: +.word 0x1921fb54*4+1 @ no shift before first iteration +.word 0x0ed63383*4+0 +.word 0x07d6dd7e*4+0 +.word 0x03fab753*4+0 +.word 0x01ff55bb*4+0 +.word 0x00ffeaae*4+0 +.word 0x007ffd55*4+0 +.word 0x003fffab*4+0 +.word 0x001ffff5*4+0 +.word 0x000fffff*4+0 +.word 0x0007ffff*4+0 +.word 0x00040000*4+0 +.word 0x00020000*4+0+2 @ +2 marks end + +.align 2 +.thumb_func +qfp_fsub: + ldr r2,=#0x80000000 + eors r1,r2 @ flip sign on second argument +@ drop into fadd, on .align2:ed boundary + +.thumb_func +qfp_fadd: + push {r4,r5,r6,r14} + asrs r4,r0,#31 + lsls r2,r0,#1 + lsrs r2,#24 @ x exponent + beq fa_xe0 + cmp r2,#255 + beq fa_xe255 +fa_xe: + asrs r5,r1,#31 + lsls r3,r1,#1 + lsrs r3,#24 @ y exponent + beq fa_ye0 + cmp r3,#255 + beq fa_ye255 +fa_ye: + ldr r6,=#0x007fffff + ands r0,r0,r6 @ extract mantissa bits + ands r1,r1,r6 + adds r6,#1 @ r6=0x00800000 + orrs r0,r0,r6 @ set implied 1 + orrs r1,r1,r6 + eors r0,r0,r4 @ complement... + eors r1,r1,r5 + subs r0,r0,r4 @ ... and add 1 if sign bit is set: 2's complement + subs r1,r1,r5 + subs r5,r3,r2 @ ye-xe + subs r4,r2,r3 @ xe-ye + bmi fa_ygtx +@ here xe>=ye + cmp r4,#30 + bge fa_xmgty @ xe much greater than ye? + adds r5,#32 + movs r3,r2 @ save exponent +@ here y in r1 must be shifted down r4 places to align with x in r0 + movs r2,r1 + lsls r2,r2,r5 @ keep the bits we will shift off the bottom of r1 + asrs r1,r1,r4 + b fa_0 + +.ltorg + +fa_ymgtx: + movs r2,#0 @ result is just y + movs r0,r1 + b fa_1 +fa_xmgty: + movs r3,r2 @ result is just x + movs r2,#0 + b fa_1 + +fa_ygtx: +@ here ye>xe + cmp r5,#30 + bge fa_ymgtx @ ye much greater than xe? + adds r4,#32 +@ here x in r0 must be shifted down r5 places to align with y in r1 + movs r2,r0 + lsls r2,r2,r4 @ keep the bits we will shift off the bottom of r0 + asrs r0,r0,r5 +fa_0: + adds r0,r1 @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3 + beq fa_9 @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0 +fa_1: + lsrs r1,r0,#31 @ sign bit + beq fa_8 + mvns r0,r0 + rsbs r2,r2,#0 + bne fa_8 + adds r0,#1 +fa_8: + adds r6,r6 +@ r6=0x01000000 + cmp r0,r6 + bhs fa_2 +fa_3: + adds r2,r2 @ normalisation loop + adcs r0,r0 + subs r3,#1 @ adjust exponent + cmp r0,r6 + blo fa_3 +fa_2: +@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit + lsrs r0,#1 + bcc fa_4 +@ rounding bits here are 1:r2 + adds r0,#1 @ round up + cmp r2,#0 + beq fa_5 @ sticky bits all zero? +fa_4: + cmp r3,#254 + bhs fa_6 @ exponent too large or negative? + lsls r1,#31 @ pack everything + add r0,r1 + lsls r3,#23 + add r0,r3 +fa_end: + pop {r4,r5,r6,r15} + +fa_9: + cmp r2,#0 @ result zero? 
+ beq fa_end @ return +0
+ b fa_1
+
+fa_5:
+ lsrs r0,#1
+ lsls r0,#1 @ round to even
+ b fa_4
+
+fa_6:
+ bge fa_7
+@ underflow
+@ can handle denormals here
+ lsls r0,r1,#31 @ result is signed zero
+ pop {r4,r5,r6,r15}
+fa_7:
+@ overflow
+ lsls r0,r1,#8
+ adds r0,#255
+ lsls r0,#23 @ result is signed infinity
+ pop {r4,r5,r6,r15}
+
+
+fa_xe0:
+@ can handle denormals here
+ subs r2,#32
+ adds r2,r4 @ exponent -32 for +ve, -33 for -ve
+ b fa_xe
+
+fa_xe255:
+@ can handle NaNs here
+ lsls r2,#8
+ add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
+ b fa_xe
+
+fa_ye0:
+@ can handle denormals here
+ subs r3,#32
+ adds r3,r5 @ exponent -32 for +ve, -33 for -ve
+ b fa_ye
+
+fa_ye255:
+@ can handle NaNs here
+ lsls r3,#8
+ add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
+ b fa_ye
+
+
+.align 2
+.thumb_func
+qfp_fmul:
+ push {r7,r14}
+ mov r2,r0
+ eors r2,r1 @ sign of result
+ lsrs r2,#31
+ lsls r2,#31
+ mov r14,r2
+ lsls r0,#1
+ lsls r1,#1
+ lsrs r2,r0,#24 @ xe
+ beq fm_xe0
+ cmp r2,#255
+ beq fm_xe255
+fm_xe:
+ lsrs r3,r1,#24 @ ye
+ beq fm_ye0
+ cmp r3,#255
+ beq fm_ye255
+fm_ye:
+ adds r7,r2,r3 @ exponent of result (will possibly be incremented)
+ subs r7,#128 @ adjust bias for packing
+ lsls r0,#8 @ x mantissa
+ lsls r1,#8 @ y mantissa
+ lsrs r0,#9
+ lsrs r1,#9
+
+ adds r2,r0,r1 @ for later
+ mov r12,r2
+ lsrs r2,r0,#7 @ x[22..7] Q16
+ lsrs r3,r1,#7 @ y[22..7] Q16
+ muls r2,r2,r3 @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
+ muls r0,r0,r1 @ result [31..0] Q46
+ lsrs r2,#18 @ result [45..32] Q14
+ bcc 1f
+ cmp r0,#0
+ bmi 1f
+ adds r2,#1 @ fix error in r2
+1:
+ lsls r3,r0,#9 @ bits off bottom of result
+ lsrs r0,#23 @ Q23
+ lsls r2,#9
+ adds r0,r2 @ cut'n'shut
+ add r0,r12 @ implied 1*(x+y) to compensate for no insertion of implied 1s
+@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
+
+ lsrs r1,r0,#23
+ bne fm_0 @ branch if we need to shift down one place
+@ here 1<=result<2
+ cmp r7,#254
+ bhs fm_3a @ catches both underflow and overflow
+ lsls r3,#1 @ sticky bits at top of R3, rounding bit in carry
+ bcc fm_1 @ no rounding
+ beq fm_2 @ rounding tie?
+ adds r0,#1 @ round up
+fm_1:
+ adds r7,#1 @ for implied 1
+ lsls r7,#23 @ pack result
+ add r0,r7
+ add r0,r14
+ pop {r7,r15}
+fm_2: @ rounding tie
+ adds r0,#1
+fm_3:
+ lsrs r0,#1
+ lsls r0,#1 @ clear bottom bit
+ b fm_1
+
+@ here 1<=result-1<3
+fm_0:
+ adds r7,#1 @ increment exponent
+ cmp r7,#254
+ bhs fm_3b @ catches both underflow and overflow
+ lsrs r0,#1 @ shift mantissa down
+ bcc fm_1a @ no rounding
+ adds r0,#1 @ assume we will round up
+ cmp r3,#0 @ sticky bits
+ beq fm_3c @ rounding tie?
+fm_1a:
+ adds r7,r7
+ adds r7,#1 @ for implied 1
+ lsls r7,#22 @ pack result
+ add r0,r7
+ add r0,r14
+ pop {r7,r15}
+
+fm_3c:
+ lsrs r0,#1
+ lsls r0,#1 @ clear bottom bit
+ b fm_1a
+
+fm_xe0:
+ subs r2,#16
+fm_xe255:
+ lsls r2,#8
+ b fm_xe
+fm_ye0:
+ subs r3,#16
+fm_ye255:
+ lsls r3,#8
+ b fm_ye
+
+@ here the result is under- or overflowing
+fm_3b:
+ bge fm_4 @ branch on overflow
+@ trap case where result is denormal 0x007fffff + 0.5ulp or more
+ adds r7,#1 @ exponent=-1?
+ bne fm_5
+@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
+@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
+ adds r0,#2
+ lsrs r0,#23
+ cmp r0,#3
+ bne fm_5
+ b fm_6
+
+fm_3a:
+ bge fm_4 @ branch on overflow
+@ trap case where result is denormal 0x007fffff + 0.5ulp or more
+ adds r7,#1 @ exponent=-1?
+ bne fm_5
+ adds r0,#1 @ mantissa=0xffffff (i.e., r0=0x7fffff)?
+ lsrs r0,#23
+ beq fm_5
+fm_6:
+ movs r0,#1 @ return smallest normal
+ lsls r0,#23
+ add r0,r14
+ pop {r7,r15}
+
+fm_5:
+ mov r0,r14
+ pop {r7,r15}
+fm_4:
+ movs r0,#0xff
+ lsls r0,#23
+ add r0,r14
+ pop {r7,r15}
+
+@ This version of the division algorithm uses a small lookup table (rcpapp, below) to
+@ estimate the reciprocal of the divisor to about 14 bits; then a multiplication step
+@ to get a first quotient estimate; then the remainder based on this estimate is used
+@ to calculate a correction to the quotient. The result is good to about 27 bits and
+@ so we only need to calculate the exact remainder when close to a rounding boundary.
+.align 2
+.thumb_func
+qfp_fdiv:
+ push {r4,r5,r6,r14}
+fdiv_n:
+
+ movs r4,#1
+ lsls r4,#23 @ implied 1 position
+ lsls r2,r1,#9 @ clear out sign and exponent
+ lsrs r2,r2,#9
+ orrs r2,r2,r4 @ divisor mantissa Q23 with implied 1
+
+@ here
+@ r0=packed dividend
+@ r1=packed divisor
+@ r2=divisor mantissa Q23
+@ r4=1<<23
+
+// see divtest.c
+ lsrs r3,r2,#18 @ x2=x>>18; // Q5 32..63
+ adr r5,rcpapp-32
+ ldrb r3,[r5,r3] @ u=lut5[x2-32]; // Q8
+ lsls r5,r2,#5
+ muls r5,r5,r3
+ asrs r5,#14 @ e=(i32)(u*(x<<5))>>14; // Q22
+ asrs r6,r5,#11
+ muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22
+ subs r5,r6
+ muls r5,r5,r3 @ c=(e-e2)*u; // Q30
+ lsls r6,r3,#8
+ asrs r5,#13
+ adds r5,#1
+ asrs r5,#1
+ subs r5,r6,r5 @ u0=(u<<8)-((c+0x2000)>>14); // Q16
+
+@ here
+@ r0=packed dividend
+@ r1=packed divisor
+@ r2=divisor mantissa Q23
+@ r4=1<<23
+@ r5=reciprocal estimate Q16
+
+ lsrs r6,r0,#23
+ uxtb r3,r6 @ dividend exponent
+ lsls r0,#9
+ lsrs r0,#9
+ orrs r0,r0,r4 @ dividend mantissa Q23
+
+ lsrs r1,#23
+ eors r6,r1 @ sign of result in bit 8
+ lsrs r6,#8
+ lsls r6,#31 @ sign of result in bit 31, other bits clear
+
+@ here
+@ r0=dividend mantissa Q23
+@ r1=divisor sign+exponent
+@ r2=divisor mantissa Q23
+@ r3=dividend exponent
+@ r5=reciprocal estimate Q16
+@ r6b31=sign of result
+
+ uxtb r1,r1 @ divisor exponent
+ cmp r1,#0
+ beq retinf
+ cmp r1,#255
+ beq 20f @ divisor is infinite
+ cmp r3,#0
+ beq retzero
+ cmp r3,#255
+ beq retinf
+ subs r3,r1 @ initial result exponent (no bias)
+ adds r3,#125 @ add bias
+
+ lsrs r1,r0,#8 @ dividend mantissa Q15
+
+@ here
+@ r0=dividend mantissa Q23
+@ r1=dividend mantissa Q15
+@ r2=divisor mantissa Q23
+@ r3=initial result exponent
+@ r5=reciprocal estimate Q16
+@ r6b31=sign of result
+
+ muls r1,r5
+
+ lsrs r1,#16 @ Q15 qu0=(q15)(u*y0);
+ lsls r0,r0,#15 @ dividend Q38
+ movs r4,r2
+ muls r4,r1 @ Q38 qu0*x
+ subs r4,r0,r4 @ Q38 re0=(y<<15)-qu0*x; note this remainder is signed
+ asrs r4,#10
+ muls r4,r5 @ Q44 qu1=(re0>>10)*u; this quotient correction is also signed
+ asrs r4,#16 @ Q28
+ lsls r1,#13
+ adds r1,r1,r4 @ Q28 qu=(qu0<<13)+(qu1>>16);
+
+@ here
+@ r0=dividend mantissa Q38
+@ r1=quotient Q28
+@ r2=divisor mantissa Q23
+@ r3=initial result exponent
+@ r6b31=sign of result
+
+ lsrs r4,r1,#28
+ bne 1f
+@ here the quotient is less than 1<<28 (i.e., result mantissa <1.0)
+
+ adds r1,#5
+ lsrs r4,r1,#4 @ rounding + small reduction in systematic bias
+ bcc 2f @ skip if we are not near a rounding boundary
+ lsrs r1,#3 @ quotient Q25
+ lsls r0,#10 @ dividend mantissa Q48
+ muls r1,r1,r2 @ quotient*divisor Q48
+ subs r0,r0,r1 @ remainder Q48
+ bmi 2f
+ b 3f
+
+1:
+@ here the quotient is at least 1<<28 (i.e., result mantissa >=1.0)
+
+ adds r3,#1 @ bump exponent (and shift mantissa down one more place)
+ adds r1,#9
+ lsrs r4,r1,#5 @ rounding + small reduction in systematic bias
+ bcc 2f @ skip if we are not near a rounding boundary
+
+ lsrs r1,#4 @ quotient Q24
@ quotient Q24
+ lsls r0,#9 @ dividend mantissa Q47
+ muls r1,r1,r2 @ quotient*divisor Q47
+ subs r0,r0,r1 @ remainder Q47
+ bmi 2f
+3:
+ adds r4,#1 @ increment quotient as we are above the rounding boundary
+
+@ here
+@ r3=result exponent
+@ r4=correctly rounded quotient Q23 in range [1,2] *note closed interval*
+@ r6b31=sign of result
+
+2:
+ cmp r3,#254
+ bhs 10f @ this catches both underflow and overflow
+ lsls r1,r3,#23
+ adds r0,r4,r1
+ adds r0,r6
+ pop {r4,r5,r6,r15}
+
+@ here divisor is infinite; dividend exponent in r3
+20:
+ cmp r3,#255
+ bne retzero
+
+retinf:
+ movs r0,#255
+21:
+ lsls r0,#23
+ orrs r0,r6
+ pop {r4,r5,r6,r15}
+
+10:
+ bge retinf @ overflow?
+ adds r1,r3,#1
+ bne retzero @ exponent <-1? return 0
+@ here exponent is exactly -1
+ lsrs r1,r4,#25
+ bcc retzero @ mantissa is not 01000000?
+@ return minimum normal
+ movs r0,#1
+ lsls r0,#23
+ orrs r0,r6
+ pop {r4,r5,r6,r15}
+
+retzero:
+ movs r0,r6
+ pop {r4,r5,r6,r15}
+
+@ x2=[32:1:63]/32;
+@ round(256 ./(x2+1/64))
+.align 2
+rcpapp:
+.byte 252,245,237,231,224,218,213,207,202,197,193,188,184,180,176,172
+.byte 169,165,162,159,156,153,150,148,145,142,140,138,135,133,131,129
+
+@ The square root routine uses an initial approximation to the reciprocal of the square root of the argument based
+@ on the top four bits of the mantissa (possibly shifted one place to make the exponent even). It then performs two
+@ Newton-Raphson iterations, resulting in about 14 bits of accuracy. This reciprocal is then multiplied by
+@ the original argument to produce an approximation to the result, again with about 14 bits of accuracy.
+@ Then a remainder is calculated, and multiplied by the reciprocal estimate to generate a correction term
+@ giving a final answer to about 28 bits of accuracy. A final remainder calculation rounds to the correct
+@ result if necessary.
+@ Again, the fixed-point calculation is carefully implemented to preserve accuracy, and similar comments to those
+@ made above on the fast division routine apply.
+@ The reciprocal square root calculation has been tested for all possible (possibly shifted) input mantissa values.
+.align 2
+.thumb_func
+qfp_fsqrt:
+ push {r4}
+ lsls r1,r0,#1
+ bcs sq_0 @ negative?
+ lsls r1,#8
+ lsrs r1,#9 @ mantissa
+ movs r2,#1
+ lsls r2,#23
+ adds r1,r2 @ insert implied 1
+ lsrs r2,r0,#23 @ extract exponent
+ beq sq_2 @ zero?
+ cmp r2,#255 @ infinite?
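+
+@ (The beq below routes an infinite argument to sq_1; zero and negative
+@ arguments were dispatched to sq_2/sq_0 above.)  The following C model of
+@ the refinement described in the comment before qfp_fsqrt is illustrative
+@ only and not part of qfplib: it uses float arithmetic in place of the
+@ exact Q formats of the assembly, and rsqrt_table is a hypothetical
+@ stand-in for the rsqrtapp lookup.
+/*
+extern float rsqrt_table(float y);        // stand-in for the rsqrtapp table
+
+float fsqrt_model(float y)                // y in [1,4) after unpacking
+{
+    float a = rsqrt_table(y);             // ~4-bit guess at 1/sqrt(y)
+    a = a * (1.5f - 0.5f * y * a * a);    // Newton-Raphson step 1
+    a = a * (1.5f - 0.5f * y * a * a);    // Newton-Raphson step 2: ~14 bits
+    float s = a * y;                      // ~14-bit estimate of sqrt(y)
+    float r = y - s * s;                  // remainder
+    return s + 0.5f * a * r;              // corrected result, ~28 bits
+}
+*/
+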
+ beq sq_1 + adds r2,#125 @ correction for packing + asrs r2,#1 @ exponent/2, LSB into carry + bcc 1f + lsls r1,#1 @ was even: double mantissa; mantissa y now 1..4 Q23 +1: + adr r4,rsqrtapp-4@ first four table entries are never accessed because of the mantissa's leading 1 + lsrs r3,r1,#21 @ y Q2 + ldrb r4,[r4,r3] @ initial approximation to reciprocal square root a0 Q8 + + lsrs r0,r1,#7 @ y Q16: first Newton-Raphson iteration + muls r0,r4 @ a0*y Q24 + muls r0,r4 @ r0=p0=a0*y*y Q32 + asrs r0,#12 @ r0 Q20 + muls r0,r4 @ dy0=a0*r0 Q28 + asrs r0,#13 @ dy0 Q15 + lsls r4,#8 @ a0 Q16 + subs r4,r0 @ a1=a0-dy0/2 Q16-Q15/2 -> Q16 + adds r4,#170 @ mostly remove systematic error in this approximation: gains approximately 1 bit + + movs r0,r4 @ second Newton-Raphson iteration + muls r0,r0 @ a1*a1 Q32 + lsrs r0,#15 @ a1*a1 Q17 + lsrs r3,r1,#8 @ y Q15 + muls r0,r3 @ r1=p1=a1*a1*y Q32 + asrs r0,#12 @ r1 Q20 + muls r0,r4 @ dy1=a1*r1 Q36 + asrs r0,#21 @ dy1 Q15 + subs r4,r0 @ a2=a1-dy1/2 Q16-Q15/2 -> Q16 + + muls r3,r4 @ a3=y*a2 Q31 + lsrs r3,#15 @ a3 Q16 +@ here a2 is an approximation to the reciprocal square root +@ and a3 is an approximation to the square root + movs r0,r3 + muls r0,r0 @ a3*a3 Q32 + lsls r1,#9 @ y Q32 + subs r0,r1,r0 @ r2=y-a3*a3 Q32 remainder + asrs r0,#5 @ r2 Q27 + muls r4,r0 @ r2*a2 Q43 + lsls r3,#7 @ a3 Q23 + asrs r0,r4,#15 @ r2*a2 Q28 + adds r0,#16 @ rounding to Q24 + asrs r0,r0,#6 @ r2*a2 Q22 + add r3,r0 @ a4 Q23: candidate final result + bcc sq_3 @ near rounding boundary? skip if no rounding needed + mov r4,r3 + adcs r4,r4 @ a4+0.5ulp Q24 + muls r4,r4 @ Q48 + lsls r1,#16 @ y Q48 + subs r1,r4 @ remainder Q48 + bmi sq_3 + adds r3,#1 @ round up +sq_3: + lsls r2,#23 @ pack exponent + adds r0,r2,r3 +sq_6: + pop {r4} + bx r14 + +sq_0: + lsrs r1,#24 + beq sq_2 @ -0: return it +@ here negative and not -0: return -Inf + asrs r0,#31 +sq_5: + lsls r0,#23 + b sq_6 +sq_1: @ +Inf + lsrs r0,#23 + b sq_5 +sq_2: + lsrs r0,#31 + lsls r0,#31 + b sq_6 + +@ round(sqrt(2^22./[72:16:248])) +rsqrtapp: +.byte 0xf1,0xda,0xc9,0xbb, 0xb0,0xa6,0x9e,0x97, 0x91,0x8b,0x86,0x82 + + + +@ Notation: +@ rx:ry means the concatenation of rx and ry with rx having the less significant bits + +@ IEEE double in ra:rb -> +@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set +@ exponent in re +@ sign in rs +@ trashes rt +.macro mdunpack ra,rb,re,rs,rt + lsrs \re,\rb,#20 @ extract sign and exponent + subs \rs,\re,#1 + lsls \rs,#20 + subs \rb,\rs @ clear sign and exponent in mantissa; insert implied 1 + lsrs \rs,\re,#11 @ sign + lsls \re,#21 + lsrs \re,#21 @ exponent + beq l\@_1 @ zero exponent? + adds \rt,\re,#1 + lsrs \rt,#11 + beq l\@_2 @ exponent != 0x7ff? then done +l\@_1: + movs \ra,#0 + movs \rb,#1 + lsls \rb,#20 + subs \re,#128 + lsls \re,#12 +l\@_2: +.endm + +@ IEEE double in ra:rb -> +@ signed mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 +@ exponent in re +@ trashes rt0 and rt1 +@ +zero, +denormal -> exponent=-0x80000 +@ -zero, -denormal -> exponent=-0x80000 +@ +Inf, +NaN -> exponent=+0x77f000 +@ -Inf, -NaN -> exponent=+0x77e000 +.macro mdunpacks ra,rb,re,rt0,rt1 + lsrs \re,\rb,#20 @ extract sign and exponent + lsrs \rt1,\rb,#31 @ sign only + subs \rt0,\re,#1 + lsls \rt0,#20 + subs \rb,\rt0 @ clear sign and exponent in mantissa; insert implied 1 + lsls \re,#21 + bcc l\@_1 @ skip on positive + mvns \rb,\rb @ negate mantissa + rsbs \ra,#0 + bcc l\@_1 + adds \rb,#1 +l\@_1: + lsrs \re,#21 + beq l\@_2 @ zero exponent? + adds \rt0,\re,#1 + lsrs \rt0,#11 + beq l\@_3 @ exponent != 0x7ff? 
then done + subs \re,\rt1 +l\@_2: + movs \ra,#0 + lsls \rt1,#1 @ +ve: 0 -ve: 2 + adds \rb,\rt1,#1 @ +ve: 1 -ve: 3 + lsls \rb,#30 @ create +/-1 mantissa + asrs \rb,#10 + subs \re,#128 + lsls \re,#12 +l\@_3: +.endm + +.align 2 +.thumb_func +qfp_dsub: + push {r4-r7,r14} + movs r4,#1 + lsls r4,#31 + eors r3,r4 @ flip sign on second argument + b da_entry @ continue in dadd + +.align 2 +.thumb_func +qfp_dadd: + push {r4-r7,r14} +da_entry: + mdunpacks r0,r1,r4,r6,r7 + mdunpacks r2,r3,r5,r6,r7 + subs r7,r5,r4 @ ye-xe + subs r6,r4,r5 @ xe-ye + bmi da_ygtx +@ here xe>=ye: need to shift y down r6 places + mov r12,r4 @ save exponent + cmp r6,#32 + bge da_xrgty @ xe rather greater than ye? + adds r7,#32 + movs r4,r2 + lsls r4,r4,r7 @ rounding bit + sticky bits +da_xgty0: + movs r5,r3 + lsls r5,r5,r7 + lsrs r2,r6 + asrs r3,r6 + orrs r2,r5 +da_add: + adds r0,r2 + adcs r1,r3 +da_pack: +@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4 +@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we +@ cannot have not gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4 +@ to provide a correct continuation of the exact result. +@ now pack result back up + lsrs r3,r1,#31 @ get sign bit + beq 1f @ skip on positive + mvns r1,r1 @ negate mantissa + mvns r0,r0 + movs r2,#0 + rsbs r4,#0 + adcs r0,r2 + adcs r1,r2 +1: + mov r2,r12 @ get exponent + lsrs r5,r1,#21 + bne da_0 @ shift down required? + lsrs r5,r1,#20 + bne da_1 @ normalised? + cmp r0,#0 + beq da_5 @ could mantissa be zero? +da_2: + adds r4,r4 + adcs r0,r0 + adcs r1,r1 + subs r2,#1 @ adjust exponent + lsrs r5,r1,#20 + beq da_2 +da_1: + lsls r4,#1 @ check rounding bit + bcc da_3 +da_4: + adds r0,#1 @ round up + bcc 2f + adds r1,#1 +2: + cmp r4,#0 @ sticky bits zero? + bne da_3 + lsrs r0,#1 @ round to even + lsls r0,#1 +da_3: + subs r2,#1 + bmi da_6 + adds r4,r2,#2 @ check if exponent is overflowing + lsrs r4,#11 + bne da_7 + lsls r2,#20 @ pack exponent and sign + add r1,r2 + lsls r3,#31 + add r1,r3 + pop {r4-r7,r15} + +da_7: +@ here exponent overflow: return signed infinity + lsls r1,r3,#31 + ldr r3,=#0x7ff00000 + orrs r1,r3 + b 1f +da_6: +@ here exponent underflow: return signed zero + lsls r1,r3,#31 +1: + movs r0,#0 + pop {r4-r7,r15} + +da_5: +@ here mantissa could be zero + cmp r1,#0 + bne da_2 + cmp r4,#0 + bne da_2 +@ inputs must have been of identical magnitude and opposite sign, so return +0 + pop {r4-r7,r15} + +da_0: +@ here a shift down by one place is required for normalisation + adds r2,#1 @ adjust exponent + lsls r6,r0,#31 @ save rounding bit + lsrs r0,#1 + lsls r5,r1,#31 + orrs r0,r5 + lsrs r1,#1 + cmp r6,#0 + beq da_3 + b da_4 + +da_xrgty: @ xe>ye and shift>=32 places + cmp r6,#60 + bge da_xmgty @ xe much greater than ye? + subs r6,#32 + adds r7,#64 + + movs r4,r2 + lsls r4,r4,r7 @ these would be shifted off the bottom of the sticky bits + beq 1f + movs r4,#1 +1: + lsrs r2,r2,r6 + orrs r4,r2 + movs r2,r3 + lsls r3,r3,r7 + orrs r4,r3 + asrs r3,r2,#31 @ propagate sign bit + b da_xgty0 + +da_ygtx: +@ here ye>xe: need to shift x down r7 places + mov r12,r5 @ save exponent + cmp r7,#32 + bge da_yrgtx @ ye rather greater than xe? + adds r6,#32 + movs r4,r0 + lsls r4,r4,r6 @ rounding bit + sticky bits +da_ygtx0: + movs r5,r1 + lsls r5,r5,r6 + lsrs r0,r7 + asrs r1,r7 + orrs r0,r5 + b da_add + +da_yrgtx: + cmp r7,#60 + bge da_ymgtx @ ye much greater than xe? 
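+
+@ As in da_xrgty above, x must be shifted down 32..59 places to align the
+@ binary points; every bit shifted out is OR-ed into the sticky word in r4 so
+@ that the rounding at da_1/da_4/da_3 can still detect a non-zero tail.
+@ Illustrative C model of that rounding (not part of qfplib): m is the
+@ normalised mantissa, rs holds the rounding bit in its MSB with the sticky
+@ bits below, as in r4.
+/*
+#include <stdint.h>
+
+static uint64_t round_nearest_even(uint64_t m, uint32_t rs)
+{
+    if (rs & 0x80000000u) {       // rounding bit set: round up...
+        m += 1;
+        if ((rs << 1) == 0)       // ...unless exactly halfway between values:
+            m &= ~(uint64_t)1;    // then force the LSB to 0 (round to even)
+    }
+    return m;
+}
+*/
+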
+ subs r7,#32 + adds r6,#64 + + movs r4,r0 + lsls r4,r4,r6 @ these would be shifted off the bottom of the sticky bits + beq 1f + movs r4,#1 +1: + lsrs r0,r0,r7 + orrs r4,r0 + movs r0,r1 + lsls r1,r1,r6 + orrs r4,r1 + asrs r1,r0,#31 @ propagate sign bit + b da_ygtx0 + +da_ymgtx: @ result is just y + movs r0,r2 + movs r1,r3 +da_xmgty: @ result is just x + movs r4,#0 @ clear sticky bits + b da_pack + +.ltorg + +@ equivalent of UMULL +@ needs five temporary registers +@ can have rt3==rx, in which case rx trashed +@ can have rt4==ry, in which case ry trashed +@ can have rzl==rx +@ can have rzh==ry +@ can have rzl,rzh==rt3,rt4 +.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 + @ t0 t1 t2 t3 t4 + @ (x) (y) + uxth \rt0,\rx @ xl + uxth \rt1,\ry @ yl + muls \rt0,\rt1 @ xlyl=L + lsrs \rt2,\rx,#16 @ xh + muls \rt1,\rt2 @ xhyl=M0 + lsrs \rt4,\ry,#16 @ yh + muls \rt2,\rt4 @ xhyh=H + uxth \rt3,\rx @ xl + muls \rt3,\rt4 @ xlyh=M1 + adds \rt1,\rt3 @ M0+M1=M + bcc l\@_1 @ addition of the two cross terms can overflow, so add carry into H + movs \rt3,#1 @ 1 + lsls \rt3,#16 @ 0x10000 + adds \rt2,\rt3 @ H' +l\@_1: + @ t0 t1 t2 t3 t4 + @ (zl) (zh) + lsls \rzl,\rt1,#16 @ ML + lsrs \rzh,\rt1,#16 @ MH + adds \rzl,\rt0 @ ZL + adcs \rzh,\rt2 @ ZH +.endm + +@ SUMULL: x signed, y unsigned +@ in table below ¯ means signed variable +@ needs five temporary registers +@ can have rt3==rx, in which case rx trashed +@ can have rt4==ry, in which case ry trashed +@ can have rzl==rx +@ can have rzh==ry +@ can have rzl,rzh==rt3,rt4 +.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 + @ t0 t1 t2 t3 t4 + @ ¯(x) (y) + uxth \rt0,\rx @ xl + uxth \rt1,\ry @ yl + muls \rt0,\rt1 @ xlyl=L + asrs \rt2,\rx,#16 @ ¯xh + muls \rt1,\rt2 @ ¯xhyl=M0 + lsrs \rt4,\ry,#16 @ yh + muls \rt2,\rt4 @ ¯xhyh=H + uxth \rt3,\rx @ xl + muls \rt3,\rt4 @ xlyh=M1 + asrs \rt4,\rt1,#31 @ M0sx (M1 sign extension is zero) + adds \rt1,\rt3 @ M0+M1=M + movs \rt3,#0 @ 0 + adcs \rt4,\rt3 @ ¯Msx + lsls \rt4,#16 @ ¯Msx<<16 + adds \rt2,\rt4 @ H' + + @ t0 t1 t2 t3 t4 + @ (zl) (zh) + lsls \rzl,\rt1,#16 @ M~ + lsrs \rzh,\rt1,#16 @ M~ + adds \rzl,\rt0 @ ZL + adcs \rzh,\rt2 @ ¯ZH +.endm + +@ SSMULL: x signed, y signed +@ in table below ¯ means signed variable +@ needs five temporary registers +@ can have rt3==rx, in which case rx trashed +@ can have rt4==ry, in which case ry trashed +@ can have rzl==rx +@ can have rzh==ry +@ can have rzl,rzh==rt3,rt4 +.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4 + @ t0 t1 t2 t3 t4 + @ ¯(x) (y) + uxth \rt0,\rx @ xl + uxth \rt1,\ry @ yl + muls \rt0,\rt1 @ xlyl=L + asrs \rt2,\rx,#16 @ ¯xh + muls \rt1,\rt2 @ ¯xhyl=M0 + asrs \rt4,\ry,#16 @ ¯yh + muls \rt2,\rt4 @ ¯xhyh=H + uxth \rt3,\rx @ xl + muls \rt3,\rt4 @ ¯xlyh=M1 + adds \rt1,\rt3 @ ¯M0+M1=M + asrs \rt3,\rt1,#31 @ Msx + bvc l\@_1 @ + mvns \rt3,\rt3 @ ¯Msx flip sign extension bits if overflow +l\@_1: + lsls \rt3,#16 @ ¯Msx<<16 + adds \rt2,\rt3 @ H' + + @ t0 t1 t2 t3 t4 + @ (zl) (zh) + lsls \rzl,\rt1,#16 @ M~ + lsrs \rzh,\rt1,#16 @ M~ + adds \rzl,\rt0 @ ZL + adcs \rzh,\rt2 @ ¯ZH +.endm + +@ can have rt2==rx, in which case rx trashed +@ can have rzl==rx +@ can have rzh==rt1 +.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2 + @ t0 t1 t2 zl zh + uxth \rt0,\rx @ xl + muls \rt0,\rt0 @ xlxl=L + uxth \rt1,\rx @ xl + lsrs \rt2,\rx,#16 @ xh + muls \rt1,\rt2 @ xlxh=M + muls \rt2,\rt2 @ xhxh=H + lsls \rzl,\rt1,#17 @ ML + lsrs \rzh,\rt1,#15 @ MH + adds \rzl,\rt0 @ ZL + adcs \rzh,\rt2 @ ZH +.endm + +.align 2 +.thumb_func +qfp_dmul: + push {r4-r7,r14} + mdunpack r0,r1,r4,r6,r5 + mov r12,r4 + 
mdunpack r2,r3,r4,r7,r5 + eors r7,r6 @ sign of result + add r4,r12 @ exponent of result + push {r0-r2,r4,r7} + +@ accumulate full product in r12:r5:r6:r7 + mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5 @ XL*YL + mov r12,r0 @ save LL bits + + mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7 @ XH*YH + + pop {r0} @ XL + mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3 @ XL*YH + adds r5,r0 + adcs r6,r3 + movs r0,#0 + adcs r7,r0 + + pop {r1,r2} @ XH,YL + mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2 @ XH*YL + adds r5,r1 + adcs r6,r2 + movs r0,#0 + adcs r7,r0 + +@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12 + pop {r3,r4} @ exponent in r3, sign in r4 + lsls r1,r7,#11 + lsrs r2,r6,#21 + orrs r1,r2 + lsls r0,r6,#11 + lsrs r2,r5,#21 + orrs r0,r2 + lsls r5,#11 @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12 + lsrs r2,r1,#20 + bne 1f @ skip if in range [2..4) + adds r5,r5 @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32) + adcs r0,r0 + adcs r1,r1 + subs r3,#1 @ correct exponent +1: + ldr r6,=#0x3ff + subs r3,r6 @ correct for exponent bias + lsls r6,#1 @ 0x7fe + cmp r3,r6 + bhs dm_0 @ exponent over- or underflow + lsls r5,#1 @ rounding bit to carry + bcc 1f @ result is correctly rounded + adds r0,#1 + movs r6,#0 + adcs r1,r6 @ round up + mov r6,r12 @ remaining sticky bits + orrs r5,r6 + bne 1f @ some sticky bits set? + lsrs r0,#1 + lsls r0,#1 @ round to even +1: + lsls r3,#20 + adds r1,r3 +dm_2: + lsls r4,#31 + add r1,r4 + pop {r4-r7,r15} + +@ here for exponent over- or underflow +dm_0: + bge dm_1 @ overflow? + adds r3,#1 @ would-be zero exponent? + bne 1f + adds r0,#1 + bne 1f @ all-ones mantissa? + adds r1,#1 + lsrs r7,r1,#21 + beq 1f + lsrs r1,#1 + b dm_2 +1: + lsls r1,r4,#31 + movs r0,#0 + pop {r4-r7,r15} + +@ here for exponent overflow +dm_1: + adds r6,#1 @ 0x7ff + lsls r1,r6,#20 + movs r0,#0 + b dm_2 + +.ltorg + +@ Approach to division y/x is as follows. +@ +@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top +@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so). +@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction +@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact +@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to +@ round up or down. +@ +@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively +@ that | u1*x-1 | < 10*2^-32. +@ +@ More precisely: +@ +@ x0=(q16)x; +@ x1=(q30)x; +@ y0=(q31)y; +@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation +@ v=(q30)(u0*x1-1); +@ u1=(q30)u0-(q30~)(u0*v); +@ +@ a0=(q30)(u1*y0); +@ r0=(q82)y-a0*x; +@ r0x=(q57)r0; +@ d0=r0x*u1; +@ a1=d0+a0; +@ +@ Error analysis +@ +@ Use Greek letters to represent the errors introduced by rounding and truncation. +@ +@ r₀ = y - a₀x +@ = y - [ u₁ ( y - α ) - β ] x where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30 +@ = y ( 1 - u₁x ) + ( u₁α + β ) x +@ +@ Hence +@ +@ | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30 +@ = 26*2^-32 +@ +@ r₁ = y - a₁x +@ = y - a₀x - d₀x +@ = r₀ - d₀x +@ = r₀ - u₁ ( r₀ - γ ) x where 0 ≤ γ < 2^-57 +@ = r₀ ( 1 - u₁x ) + u₁γx +@ +@ Hence +@ +@ | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57 +@ = (260+128)*2^-64 +@ < 2^-55 +@ +@ Empirically it seems to be nearly twice as good as this. +@ +@ To determine correctly whether the exact remainder calculation can be skipped we need a result +@ accurate to < 0.25ulp. 
In the case where x>y the quotient will be shifted up one place for normalisation +@ and so 1ulp is 2^-53 and so the calculation above suffices. + +.align 2 +.thumb_func +qfp_ddiv: + push {r4-r7,r14} +ddiv0: @ entry point from dtan + mdunpack r2,r3,r4,r7,r6 @ unpack divisor + +@ unpack dividend by hand to save on register use + lsrs r6,r1,#31 + adds r6,r7 + mov r12,r6 @ result sign in r12b0; r12b1 trashed + lsls r1,#1 + lsrs r7,r1,#21 @ exponent + beq 1f @ zero exponent? + adds r6,r7,#1 + lsrs r6,#11 + beq 2f @ exponent != 0x7ff? then done +1: + movs r0,#0 + movs r1,#0 + subs r7,#64 @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct + lsls r7,#12 +2: + subs r6,r7,r4 + lsls r6,#2 + add r12,r12,r6 @ (signed) exponent in r12[31..8] + subs r7,#1 @ implied 1 + lsls r7,#21 + subs r1,r7 + lsrs r1,#1 + +// see dreciptest-boxc.c + lsrs r4,r3,#15 @ x2=x>>15; // Q5 32..63 + ldr r5,=#(rcpapp-32) + ldrb r4,[r5,r4] @ u=lut5[x2-32]; // Q8 + lsls r5,r3,#8 + muls r5,r5,r4 + asrs r5,#14 @ e=(i32)(u*(x<<8))>>14; // Q22 + asrs r6,r5,#11 + muls r6,r6,r6 @ e2=(e>>11)*(e>>11); // Q22 + subs r5,r6 + muls r5,r5,r4 @ c=(e-e2)*u; // Q30 + lsls r6,r4,#7 + asrs r5,#14 + adds r5,#1 + asrs r5,#1 + subs r6,r5 @ u0=(u<<7)-((c+0x4000)>>15); // Q15 + +@ here +@ r0:r1 y mantissa +@ r2:r3 x mantissa +@ r6 u0, first approximation to 1/x Q15 +@ r12: result sign, exponent + + lsls r4,r3,#10 + lsrs r5,r2,#22 + orrs r5,r4 @ x1=(q30)x + muls r5,r6 @ u0*x1 Q45 + asrs r5,#15 @ v=u0*x1-1 Q30 + muls r5,r6 @ u0*v Q45 + asrs r5,#14 + adds r5,#1 + asrs r5,#1 @ round u0*v to Q30 + lsls r6,#15 + subs r6,r5 @ u1 Q30 + +@ here +@ r0:r1 y mantissa +@ r2:r3 x mantissa +@ r6 u1, second approximation to 1/x Q30 +@ r12: result sign, exponent + + push {r2,r3} + lsls r4,r1,#11 + lsrs r5,r0,#21 + orrs r4,r5 @ y0=(q31)y + mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5 @ y0*u1 Q61 + adds r4,r4 + adcs r5,r5 @ a0=(q30)(y0*u1) + +@ here +@ r0:r1 y mantissa +@ r5 a0, first approximation to y/x Q30 +@ r6 u1, second approximation to 1/x Q30 +@ r12 result sign, exponent + + ldr r2,[r13,#0] @ xL + mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3 @ xL*a0 + ldr r4,[r13,#4] @ xH + muls r4,r5 @ xH*a0 + adds r3,r4 @ r2:r3 now x*a0 Q82 + lsrs r2,#25 + lsls r1,r3,#7 + orrs r2,r1 @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89 + lsls r4,r0,#5 @ y Q57 + subs r0,r4,r2 @ r0x=y-x*a0 Q57 (signed) + +@ here +@ r0 r0x Q57 +@ r5 a0, first approximation to y/x Q30 +@ r4 yL Q57 +@ r6 u1 Q30 +@ r12 result sign, exponent + + muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6 @ r7:r6 r0x*u1 Q87 + asrs r3,r6,#25 + adds r5,r3 + lsls r3,r6,#7 @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point) +@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7 +@ but these bits are thrown away in the rounding and conversion to Q52 below + +@ here +@ r3:r5 a1 Q62 candidate quotient [0.5,2) or so +@ r4 yL Q57 +@ r12 result sign, exponent + + movs r6,#0 + adds r3,#128 @ for initial rounding to Q53 + adcs r5,r5,r6 + lsrs r1,r5,#30 + bne dd_0 +@ here candidate quotient a1 is in range [0.5,1) +@ so 30 significant bits in r5 + + lsls r4,#1 @ y now Q58 + lsrs r1,r5,#9 @ to Q52 + lsls r0,r5,#23 + lsrs r3,#9 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result + orrs r0,r3 + bcs dd_1 + b dd_2 +dd_0: +@ here candidate quotient a1 is in range [1,2) +@ so 31 significant bits in r5 + + movs r2,#4 + add r12,r12,r2 @ fix exponent; r3:r5 now effectively Q61 + adds r3,#128 @ complete rounding to Q53 + adcs r5,r5,r6 + lsrs r1,r5,#10 + 
lsls r0,r5,#22 + lsrs r3,#10 @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result + orrs r0,r3 + bcc dd_2 +dd_1: + +@ here +@ r0:r1 rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest +@ r4 yL Q58 or Q57 +@ r12 result sign, exponent +@ carry set + + adcs r0,r0,r0 + adcs r1,r1,r1 @ z Q53 with 1 in LSB + lsls r4,#16 @ Q105-32=Q73 + ldr r2,[r13,#0] @ xL Q52 + ldr r3,[r13,#4] @ xH Q20 + + movs r5,r1 @ zH Q21 + muls r5,r2 @ zH*xL Q73 + subs r4,r5 + muls r3,r0 @ zL*xH Q73 + subs r4,r3 + mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3 @ xL*zL + rsbs r2,#0 @ borrow from low half? + sbcs r4,r3 @ y-xz Q73 (remainder bits 52..73) + + cmp r4,#0 + + bmi 1f + movs r2,#0 @ round up + adds r0,#1 + adcs r1,r2 +1: + lsrs r0,#1 @ shift back down to Q52 + lsls r2,r1,#31 + orrs r0,r2 + lsrs r1,#1 +dd_2: + add r13,#8 + mov r2,r12 + lsls r7,r2,#31 @ result sign + asrs r2,#2 @ result exponent + ldr r3,=#0x3fd + adds r2,r3 + ldr r3,=#0x7fe + cmp r2,r3 + bhs dd_3 @ over- or underflow? + lsls r2,#20 + adds r1,r2 @ pack exponent +dd_5: + adds r1,r7 @ pack sign + pop {r4-r7,r15} + +dd_3: + movs r0,#0 + cmp r2,#0 + bgt dd_4 @ overflow? + movs r1,r7 + pop {r4-r7,r15} + +dd_4: + adds r3,#1 @ 0x7ff + lsls r1,r3,#20 + b dd_5 + +/* +Approach to square root x=sqrt(y) is as follows. + +First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y +to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because +d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is +a better approximation to sqrt(y). If this is near a rounding boundary we +compute an exact remainder y-a5*a5 to decide whether to round up or down. + +The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively +that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32. + +More precisely, with "y" representing y truncated to 30 binary places: + +u=(q3)y; // 24-entry table +a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u); // first approximation from table +p0=(q16)(a0*a0) * (q16)y; +r0=(q20)(p0-1); +dy0=(q15)(r0*a0); // Newton-Raphson correction term +a1=(q16)a0-dy0/2; // good to ~9 bits + +p1=(q19)(a1*a1)*(q19)y; +r1=(q23)(p1-1); +dy1=(q15~)(r1*a1); // second Newton-Raphson correction +a2x=(q16)a1-dy1/2; // good to ~16 bits +a2=a2x-a2x/1t16; // prevent overflow of a2*a2 in 32 bits + +p2=(a2*a2)*(q30)y; // Q62 +r2=(q36)(p2-1+1t-31); +dy2=(q30)(r2*a2); // Q52->Q30 +a3=(q31)a2-dy2/2; // good to about 30 bits +a4=(q30)(a3*(q30)y+1t-31); // good to about 28 bits + +Error analysis + + r₄ = y - a₄² + d₄ = 1/2 a₃r₄ + a₅ = a₄ + d₄ + r₅ = y - a₅² + = y - ( a₄ + d₄ )² + = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄² + = r₄ - a₃a₄r₄ - 1/4 a₃²r₄² + + | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄² + + a₅ = √y √( 1 - r₅/y ) + = √y ( 1 - 1/2 r₅/y + ... ) + +So to first order (second order being very tiny) + + √y - a₅ = 1/2 r₅/y + +and + + | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y ) + +From dsqrttest.c (conservatively): + + < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 ) + = 1/2 ( 200 + 200 ) * 2^-64 + < 2^-56 + +Empirically we see about 1ulp worst-case error including rounding at Q57. + +To determine correctly whether the exact remainder calculation can be skipped we need a result +accurate to < 0.25ulp at Q52, or 2^-54. +*/ + +dq_2: + bge dq_3 @ +Inf? + movs r1,#0 + b dq_4 + +dq_0: + lsrs r1,#31 + lsls r1,#31 @ preserve sign bit + lsrs r2,#21 @ extract exponent + beq dq_4 @ -0? 
return it + asrs r1,#11 @ make -Inf + b dq_4 + +dq_3: + ldr r1,=#0x7ff + lsls r1,#20 @ return +Inf +dq_4: + movs r0,#0 +dq_1: + bx r14 + +.align 2 +.thumb_func +qfp_dsqrt: + lsls r2,r1,#1 + bcs dq_0 @ negative? + lsrs r2,#21 @ extract exponent + subs r2,#1 + ldr r3,=#0x7fe + cmp r2,r3 + bhs dq_2 @ catches 0 and +Inf + push {r4-r7,r14} + lsls r4,r2,#20 + subs r1,r4 @ insert implied 1 + lsrs r2,#1 + bcc 1f @ even exponent? skip + adds r0,r0,r0 @ odd exponent: shift up mantissa + adcs r1,r1,r1 +1: + lsrs r3,#2 + adds r2,r3 + lsls r2,#20 + mov r12,r2 @ save result exponent + +@ here +@ r0:r1 y mantissa Q52 [1,4) +@ r12 result exponent + + adr r4,drsqrtapp-8 @ first eight table entries are never accessed because of the mantissa's leading 1 + lsrs r2,r1,#17 @ y Q3 + ldrb r2,[r4,r2] @ initial approximation to reciprocal square root a0 Q8 + lsrs r3,r1,#4 @ first Newton-Raphson iteration + muls r3,r2 + muls r3,r2 @ i32 p0=a0*a0*(y>>14); // Q32 + asrs r3,r3,#12 @ i32 r0=p0>>12; // Q20 + muls r3,r2 + asrs r3,#13 @ i32 dy0=(r0*a0)>>13; // Q15 + lsls r2,#8 + subs r2,r3 @ i32 a1=(a0<<8)-dy0; // Q16 + + movs r3,r2 + muls r3,r3 + lsrs r3,#13 + lsrs r4,r1,#1 + muls r3,r4 @ i32 p1=((a1*a1)>>11)*(y>>11); // Q19*Q19=Q38 + asrs r3,#15 @ i32 r1=p1>>15; // Q23 + muls r3,r2 + asrs r3,#23 + adds r3,#1 + asrs r3,#1 @ i32 dy1=(r1*a1+(1<<23))>>24; // Q23*Q16=Q39; Q15 + subs r2,r3 @ i32 a2=a1-dy1; // Q16 + lsrs r3,r2,#16 + subs r2,r3 @ if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2 + +@ here +@ r0:r1 y mantissa +@ r2 a2 ~ 1/sqrt(y) Q16 +@ r12 result exponent + + movs r3,r2 + muls r3,r3 + lsls r1,#10 + lsrs r4,r0,#22 + orrs r1,r4 @ y Q30 + mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3 @ i64 p2=(ui64)(a2*a2)*(ui64)y; // Q62 r4:r3 + lsls r5,r3,#6 + lsrs r4,#26 + orrs r4,r5 + adds r4,#0x20 @ i32 r2=(p2>>26)+0x20; // Q36 r4 + uxth r5,r4 + muls r5,r2 + asrs r4,#16 + muls r4,r2 + lsrs r5,#16 + adds r4,r5 + asrs r4,#6 @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30 + lsls r2,#15 + subs r2,r4 + +@ here +@ r0 y low bits +@ r1 y Q30 +@ r2 a3 ~ 1/sqrt(y) Q31 +@ r12 result exponent + + mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4 + adds r3,r3,r3 + adcs r4,r4,r4 + adds r3,r3,r3 + movs r3,#0 + adcs r3,r4 @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30 + +@ here +@ r0 y low bits +@ r1 y Q30 +@ r2 a3 Q31 ~ 1/sqrt(y) +@ r3 a4 Q30 ~ sqrt(y) +@ r12 result exponent + + square32_64 r3, r4,r5, r6,r5,r7 + lsls r6,r0,#8 + lsrs r7,r1,#2 + subs r6,r4 + sbcs r7,r5 @ r4=(q60)y-a4*a4 + +@ by exhaustive testing, r4 = fffffffc0e134fdc .. 
00000003c2bf539c Q60 + + lsls r5,r7,#29 + lsrs r6,#3 + adcs r6,r5 @ r4 Q57 with rounding + muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2 @ d4=a3*r4/2 Q89 +@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30 + + adds r2,#8 + asrs r2,#5 @ d4 Q52, rounded to Q53 with spare bit in carry + +@ here +@ r0 y low bits +@ r1 y Q30 +@ r2 d4 Q52, rounded to Q53 +@ C flag contains d4_b53 +@ r3 a4 Q30 + + bcs dq_5 + + lsrs r5,r3,#10 @ a4 Q52 + lsls r4,r3,#22 + + asrs r1,r2,#31 + adds r0,r2,r4 + adcs r1,r5 @ a4+d4 + + add r1,r12 @ pack exponent + pop {r4-r7,r15} + +.ltorg + + +@ round(sqrt(2^22./[68:8:252])) +drsqrtapp: +.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8 +.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95 +.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81 + +dq_5: +@ here we are near a rounding boundary, C is set + adcs r2,r2,r2 @ d4 Q53+1ulp + lsrs r5,r3,#9 + lsls r4,r3,#23 @ r4:r5 a4 Q53 + asrs r1,r2,#31 + adds r4,r2,r4 + adcs r5,r1 @ r4:r5 a5=a4+d4 Q53+1ulp + movs r3,r5 + muls r3,r4 + square32_64 r4,r1,r2,r6,r2,r7 + adds r2,r3 + adds r2,r3 @ r1:r2 a5^2 Q106 + lsls r0,#22 @ y Q84 + + rsbs r1,#0 + sbcs r0,r2 @ remainder y-a5^2 + bmi 1f @ y +@ also set flags accordingly +.thumb_func +qfp_dcmp: + push {r4,r6,r7,r14} + ldr r7,=#0x7ff @ flush NaNs and denormals + lsls r4,r1,#1 + lsrs r4,#21 + beq 1f + cmp r4,r7 + bne 2f +1: + movs r0,#0 + lsrs r1,#20 + lsls r1,#20 +2: + lsls r4,r3,#1 + lsrs r4,#21 + beq 1f + cmp r4,r7 + bne 2f +1: + movs r2,#0 + lsrs r3,#20 + lsls r3,#20 +2: +dcmp_fast_entry: + movs r6,#1 + eors r3,r1 + bmi 4f @ opposite signs? then can proceed on basis of sign of x + eors r3,r1 @ restore r3 + bpl 1f + rsbs r6,#0 @ negative? flip comparison +1: + cmp r1,r3 + bne 1f + cmp r0,r2 + bhi 2f + blo 3f +5: + movs r6,#0 @ equal? result is 0 +1: + bgt 2f +3: + rsbs r6,#0 +2: + subs r0,r6,#0 @ copy and set flags + pop {r4,r6,r7,r15} +4: + orrs r3,r1 @ make -0==+0 + adds r3,r3 + orrs r3,r0 + orrs r3,r2 + beq 5b + cmp r1,#0 + bge 2b + b 3b + + +@ "scientific" functions start here + +.thumb_func +push_r8_r11: + mov r4,r8 + mov r5,r9 + mov r6,r10 + mov r7,r11 + push {r4-r7} + bx r14 + +.thumb_func +pop_r8_r11: + pop {r4-r7} + mov r8,r4 + mov r9,r5 + mov r10,r6 + mov r11,r7 + bx r14 + +@ double-length CORDIC rotation step + +@ r0:r1 ω +@ r6 32-i (complementary shift) +@ r7 i (shift) +@ r8:r9 x +@ r10:r11 y +@ r12 coefficient pointer + +@ an option in rotation mode would be to compute the sequence of σ values +@ in one pass, rotate the initial vector by the residual ω and then run a +@ second pass to compute the final x and y. This would relieve pressure +@ on registers and hence possibly be faster. The same trick does not work +@ in vectoring mode (but perhaps one could work to single precision in +@ a first pass and then double precision in a second pass?). 
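+
+@ Illustrative C model of one vectoring step (dcordic_vec_step below); it is
+@ not part of qfplib. x, y and w are the Q62 values held in r8:r9, r10:r11
+@ and r0:r1, dw is the dtab_cc entry atan(2^-i), and the carry-based rounding
+@ of the shifted operands is omitted:
+/*
+#include <stdint.h>
+
+static void cordic_vec_step(int64_t *x, int64_t *y, int64_t *w,
+                            int64_t dw, int i)
+{
+    int64_t x0 = *x;              // y is updated with the *old* x
+    if (*y >= 0) {                // drive y towards zero...
+        *w += dw;                 // ...accumulating the angle rotated through
+        *x += *y >> i;
+        *y -= x0 >> i;
+    } else {
+        *w -= dw;
+        *x -= *y >> i;
+        *y += x0 >> i;
+    }
+}
+*/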
+ +.thumb_func +dcordic_vec_step: + mov r2,r12 + ldmia r2!,{r3,r4} + mov r12,r2 + mov r2,r11 + cmp r2,#0 + blt 1f + b 2f + +.thumb_func +dcordic_rot_step: + mov r2,r12 + ldmia r2!,{r3,r4} + mov r12,r2 + cmp r1,#0 + bge 1f +2: +@ ω<0 / y>=0 +@ ω+=dω +@ x+=y>>i, y-=x>>i + adds r0,r3 + adcs r1,r4 + + mov r3,r11 + asrs r3,r7 + mov r4,r11 + lsls r4,r6 + mov r2,r10 + lsrs r2,r7 + orrs r2,r4 @ r2:r3 y>>i, rounding in carry + mov r4,r8 + mov r5,r9 @ r4:r5 x + adcs r2,r4 + adcs r3,r5 @ r2:r3 x+(y>>i) + mov r8,r2 + mov r9,r3 + + mov r3,r5 + lsls r3,r6 + asrs r5,r7 + lsrs r4,r7 + orrs r4,r3 @ r4:r5 x>>i, rounding in carry + mov r2,r10 + mov r3,r11 + sbcs r2,r4 + sbcs r3,r5 @ r2:r3 y-(x>>i) + mov r10,r2 + mov r11,r3 + bx r14 + + +@ ω>0 / y<0 +@ ω-=dω +@ x-=y>>i, y+=x>>i +1: + subs r0,r3 + sbcs r1,r4 + + mov r3,r9 + asrs r3,r7 + mov r4,r9 + lsls r4,r6 + mov r2,r8 + lsrs r2,r7 + orrs r2,r4 @ r2:r3 x>>i, rounding in carry + mov r4,r10 + mov r5,r11 @ r4:r5 y + adcs r2,r4 + adcs r3,r5 @ r2:r3 y+(x>>i) + mov r10,r2 + mov r11,r3 + + mov r3,r5 + lsls r3,r6 + asrs r5,r7 + lsrs r4,r7 + orrs r4,r3 @ r4:r5 y>>i, rounding in carry + mov r2,r8 + mov r3,r9 + sbcs r2,r4 + sbcs r3,r5 @ r2:r3 x-(y>>i) + mov r8,r2 + mov r9,r3 + bx r14 + +ret_dzero: + movs r0,#0 + movs r1,#0 + bx r14 + +@ convert double to signed int, rounding towards 0, clamping +.thumb_func +qfp_double2int_z: + cmp r1,#0 + bge qfp_double2int @ +ve or zero? then use rounding towards -Inf + push {r14} + lsls r1,#1 @ -ve: clear sign bit + lsrs r1,#1 + bl qfp_double2uint @ convert to unsigned, rounding towards -Inf + movs r1,#1 + lsls r1,#31 @ r1=0x80000000 + cmp r0,r1 + bhi 1f + rsbs r0,#0 + pop {r15} +1: + mov r0,r1 + pop {r15} + +@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf +@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works + +.thumb_func +qfp_double2int: + movs r2,#0 @ and fall through +.thumb_func +qfp_double2fix: + push {r14} + adds r2,#32 + bl qfp_double2fix64 + movs r0,r1 + pop {r15} + +.thumb_func +qfp_double2uint: + movs r2,#0 @ and fall through +.thumb_func +qfp_double2ufix: + push {r14} + adds r2,#32 + bl qfp_double2ufix64 + movs r0,r1 + pop {r15} + +.thumb_func +qfp_float2int64_z: + cmp r0,#0 + bge qfp_float2int64 @ +ve or zero? then use rounding towards -Inf + push {r14} + lsls r0,#1 @ -ve: clear sign bit + lsrs r0,#1 + bl qfp_float2uint64 @ convert to unsigned, rounding towards -Inf + movs r2,#1 + lsls r2,#31 @ r2=0x80000000 + cmp r1,r2 + bhs 1f + mvns r1,r1 + rsbs r0,#0 + bcc 2f + adds r1,#1 +2: + pop {r15} +1: + movs r0,#0 + mov r1,r2 + pop {r15} + +.thumb_func +qfp_float2int64: + movs r1,#0 @ and fall through +.thumb_func +qfp_float2fix64: + push {r14} + bl f2fix + b d2f64_a + +.thumb_func +qfp_float2uint64: + movs r1,#0 @ and fall through +.thumb_func +qfp_float2ufix64: + asrs r3,r0,#23 @ negative? return 0 + bmi ret_dzero +@ and fall through + +@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf +@ result clamped so that r3 can only be 0 or -1 +@ trashes r12 +.thumb_func +f2fix: + push {r4,r14} + mov r12,r1 + asrs r3,r0,#31 + lsls r0,#1 + lsrs r2,r0,#24 + beq 1f @ zero? + cmp r2,#0xff @ Inf? 
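+
+@ (The beq below sends an infinite argument to the clamp at 2f.)
+@ Illustrative C model of this conversion, not part of qfplib; it omits the
+@ clamping and the r3 sign-extension word:
+/*
+#include <math.h>
+#include <stdint.h>
+
+static int64_t f2fix_model(float x, int places)   // places as passed in r1
+{
+    return (int64_t)floorf(ldexpf(x, places));    // scale, round towards -Inf
+}
+*/
+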
+ beq 2f
+ subs r1,r2,#1
+ subs r2,#0x7f @ remove exponent bias
+ lsls r1,#24
+ subs r0,r1 @ insert implied 1
+ eors r0,r3
+ subs r0,r3 @ top two's complement
+ asrs r1,r0,#4 @ convert to double format
+ lsls r0,#28
+ b d2fix_a
+1:
+ movs r0,#0
+ movs r1,r0
+ movs r3,r0
+ pop {r4,r15}
+2:
+ mvns r0,r3 @ return max/min value
+ mvns r1,r3
+ pop {r4,r15}
+
+.thumb_func
+qfp_double2int64_z:
+ cmp r1,#0
+ bge qfp_double2int64 @ +ve or zero? then use rounding towards -Inf
+ push {r14}
+ lsls r1,#1 @ -ve: clear sign bit
+ lsrs r1,#1
+ bl qfp_double2uint64 @ convert to unsigned, rounding towards -Inf
+ cmp r1,#0
+ blt 1f
+ mvns r1,r1
+ rsbs r0,#0
+ bcc 2f
+ adds r1,#1
+2:
+ pop {r15}
+1:
+ movs r0,#0
+ movs r1,#1
+ lsls r1,#31 @ 0x80000000
+ pop {r15}
+
+.thumb_func
+qfp_double2int64:
+ movs r2,#0 @ and fall through
+.thumb_func
+qfp_double2fix64:
+ push {r14}
+ bl d2fix
+d2f64_a:
+ asrs r2,r1,#31
+ cmp r2,r3
+ bne 1f @ sign extension bits fail to match sign of result?
+ pop {r15}
+1:
+ mvns r0,r3
+ movs r1,#1
+ lsls r1,#31
+ eors r1,r1,r0 @ generate extreme fixed-point values
+ pop {r15}
+
+.thumb_func
+qfp_double2uint64:
+ movs r2,#0 @ and fall through
+.thumb_func
+qfp_double2ufix64:
+ asrs r3,r1,#20 @ negative? return 0
+ bmi ret_dzero
+@ and fall through
+
+@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf
+@ result clamped so that r3 can only be 0 or -1
+@ trashes r12
+.thumb_func
+d2fix:
+ push {r4,r14}
+ mov r12,r2
+ bl dunpacks
+ asrs r4,r2,#16
+ adds r4,#1
+ bge 1f
+ movs r1,#0 @ -0 -> +0
+1:
+ asrs r3,r1,#31
+d2fix_a:
+@ here
+@ r0:r1 two's complement mantissa
+@ r2 unbiased exponent
+@ r3 mantissa sign extension bits
+ add r2,r12 @ exponent plus offset for required binary point position
+ subs r2,#52 @ required shift
+ bmi 1f @ shift down?
+@ here a shift up by r2 places
+ cmp r2,#12 @ will clamp?
+ bge 2f
+ movs r4,r0
+ lsls r1,r2
+ lsls r0,r2
+ rsbs r2,#0
+ adds r2,#32 @ complementary shift
+ lsrs r4,r2
+ orrs r1,r4
+ pop {r4,r15}
+2:
+ mvns r0,r3
+ mvns r1,r3 @ overflow: clamp to extreme fixed-point values
+ pop {r4,r15}
+1:
+@ here a shift down by -r2 places
+ adds r2,#32
+ bmi 1f @ long shift?
+ mov r4,r1
+ lsls r4,r2
+ rsbs r2,#0
+ adds r2,#32 @ complementary shift
+ asrs r1,r2
+ lsrs r0,r2
+ orrs r0,r4
+ pop {r4,r15}
+1:
+@ here a long shift down
+ movs r0,r1
+ asrs r1,#31 @ shift down 32 places
+ adds r2,#32
+ bmi 1f @ very long shift?
+ rsbs r2,#0
+ adds r2,#32
+ asrs r0,r2
+ pop {r4,r15}
+1:
+ movs r0,r3 @ result very near zero: use sign extension bits
+ movs r1,r3
+ pop {r4,r15}
+
+@ float <-> double conversions
+.thumb_func
+qfp_float2double:
+ lsrs r3,r0,#31 @ sign bit
+ lsls r3,#31
+ lsls r1,r0,#1
+ lsrs r2,r1,#24 @ exponent
+ beq 1f @ zero?
+ cmp r2,#0xff @ Inf?
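+
+@ (The beq below routes exponent 0xff to the signed-infinity return at 2f;
+@ NaNs therefore also come back as infinities.)  For finite normal inputs the
+@ conversion is exact: re-bias the exponent from 0x7f to 0x3ff and move the
+@ 23 mantissa bits to the top of the 52-bit field. Illustrative C model on
+@ the packed bit patterns (not part of qfplib):
+/*
+#include <stdint.h>
+
+static uint64_t f2d_model(uint32_t f)    // finite, normal f only
+{
+    uint64_t sign = (uint64_t)(f >> 31) << 63;
+    uint64_t exp  = (uint64_t)(((f >> 23) & 0xffu) + 0x380u) << 52;
+    uint64_t mant = (uint64_t)(f & 0x7fffffu) << 29;    // 23 -> 52 bits
+    return sign | exp | mant;
+}
+*/
+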
+ beq 2f + lsrs r1,#4 @ exponent and top 20 bits of mantissa + ldr r2,=#(0x3ff-0x7f)<<20 @ difference in exponent offsets + adds r1,r2 + orrs r1,r3 + lsls r0,#29 @ bottom 3 bits of mantissa + bx r14 +1: + movs r1,r3 @ return signed zero +3: + movs r0,#0 + bx r14 +2: + ldr r1,=#0x7ff00000 @ return signed infinity + adds r1,r3 + b 3b + +.thumb_func +qfp_double2float: + lsls r2,r1,#1 + lsrs r2,#21 @ exponent + ldr r3,=#0x3ff-0x7f + subs r2,r3 @ fix exponent bias + ble 1f @ underflow or zero + cmp r2,#0xff + bge 2f @ overflow or infinity + lsls r2,#23 @ position exponent of result + lsrs r3,r1,#31 + lsls r3,#31 + orrs r2,r3 @ insert sign + lsls r3,r0,#3 @ rounding bits + lsrs r0,#29 + lsls r1,#12 + lsrs r1,#9 + orrs r0,r1 @ assemble mantissa + orrs r0,r2 @ insert exponent and sign + lsls r3,#1 + bcc 3f @ no rounding + beq 4f @ all sticky bits 0? +5: + adds r0,#1 +3: + bx r14 +4: + lsrs r3,r0,#1 @ odd? then round up + bcs 5b + bx r14 +1: + beq 6f @ check case where value is just less than smallest normal +7: + lsrs r0,r1,#31 + lsls r0,#31 + bx r14 +6: + lsls r2,r1,#12 @ 20 1:s at top of mantissa? + asrs r2,#12 + adds r2,#1 + bne 7b + lsrs r2,r0,#29 @ and 3 more 1:s? + cmp r2,#7 + bne 7b + movs r2,#1 @ return smallest normal with correct sign + b 8f +2: + movs r2,#0xff +8: + lsrs r0,r1,#31 @ return signed infinity + lsls r0,#8 + adds r0,r2 + lsls r0,#23 + bx r14 + +@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding + +.thumb_func +qfp_uint2double: + movs r1,#0 @ and fall through +.thumb_func +qfp_ufix2double: + movs r2,r1 + movs r1,#0 + b qfp_ufix642double + +.thumb_func +qfp_int2double: + movs r1,#0 @ and fall through +.thumb_func +qfp_fix2double: + movs r2,r1 + asrs r1,r0,#31 @ sign extend + b qfp_fix642double + +.thumb_func +qfp_uint642double: + movs r2,#0 @ and fall through +.thumb_func +qfp_ufix642double: + movs r3,#0 + b uf2d + +.thumb_func +qfp_int642double: + movs r2,#0 @ and fall through +.thumb_func +qfp_fix642double: + asrs r3,r1,#31 @ sign bit across all bits + eors r0,r3 + eors r1,r3 + subs r0,r3 + sbcs r1,r3 +uf2d: + push {r4,r5,r14} + ldr r4,=#0x432 + subs r2,r4,r2 @ form biased exponent +@ here +@ r0:r1 unnormalised mantissa +@ r2 -Q (will become exponent) +@ r3 sign across all bits + cmp r1,#0 + bne 1f @ short normalising shift? + movs r1,r0 + beq 2f @ zero? return it + movs r0,#0 + subs r2,#32 @ fix exponent +1: + asrs r4,r1,#21 + bne 3f @ will need shift down (and rounding?) + bcs 4f @ normalised already? +5: + subs r2,#1 + adds r0,r0 @ shift up + adcs r1,r1 + lsrs r4,r1,#21 + bcc 5b +4: + ldr r4,=#0x7fe + cmp r2,r4 + bhs 6f @ over/underflow? return signed zero/infinity +7: + lsls r2,#20 @ pack and return + adds r1,r2 + lsls r3,#31 + adds r1,r3 +2: + pop {r4,r5,r15} +6: @ return signed zero/infinity according to unclamped exponent in r2 + mvns r2,r2 + lsrs r2,#21 + movs r0,#0 + movs r1,#0 + b 7b + +3: +@ here we need to shift down to normalise and possibly round + bmi 1f @ already normalised to Q63? +2: + subs r2,#1 + adds r0,r0 @ shift up + adcs r1,r1 + bpl 2b +1: +@ here we have a 1 in b63 of r0:r1 + adds r2,#11 @ correct exponent for subsequent shift down + lsls r4,r0,#21 @ save bits for rounding + lsrs r0,#11 + lsls r5,r1,#21 + orrs r0,r5 + lsrs r1,#11 + lsls r4,#1 + beq 1f @ sticky bits are zero? 
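+
+@ The net effect of the code around label 8 below is round-to-nearest-even on
+@ the excess bits, i.e. exactly what a C cast from a 64-bit integer to double
+@ does. Illustrative worked example (not part of qfplib):
+/*
+    unsigned long long n = (1ull << 53) + 1;  // needs 54 significant bits
+    double d = (double)n;  // exactly halfway between 2^53 and 2^53+2:
+                           // ties go to the even mantissa, so
+                           // d == 9007199254740992.0 (2^53)
+*/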
+8: + movs r4,#0 + adcs r0,r4 + adcs r1,r4 + b 4b +1: + bcc 4b @ sticky bits are zero but not on rounding boundary + lsrs r4,r0,#1 @ increment if odd (force round to even) + b 8b + + +.ltorg + +.thumb_func +dunpacks: + mdunpacks r0,r1,r2,r3,r4 + ldr r3,=#0x3ff + subs r2,r3 @ exponent without offset + bx r14 + +@ r0:r1 signed mantissa Q52 +@ r2 unbiased exponent < 10 (i.e., |x|<2^10) +@ r4 pointer to: +@ - divisor reciprocal approximation r=1/d Q15 +@ - divisor d Q62 0..20 +@ - divisor d Q62 21..41 +@ - divisor d Q62 42..62 +@ returns: +@ r0:r1 reduced result y Q62, -0.6 d < y < 0.6 d (better in practice) +@ r2 quotient q (number of reductions) +@ if exponent >=10, returns r0:r1=0, r2=1024*mantissa sign +@ designed to work for 0.5=0: in quadrant 0 + cmp r1,r3 + ble 2f @ y<~x so 0≤θ<~π/4: skip + adds r6,#1 + eors r1,r5 @ negate x + b 3f @ and exchange x and y = rotate by -π/2 +1: + cmp r3,r7 + bge 2f @ -y<~x so -π/4<~θ≤0: skip + subs r6,#1 + eors r3,r5 @ negate y and ... +3: + movs r7,r0 @ exchange x and y + movs r0,r2 + movs r2,r7 + movs r7,r1 + movs r1,r3 + movs r3,r7 +2: +@ here -π/4<~θ<~π/4 +@ r6 has quadrant offset + push {r6} + cmp r2,#0 + bne 1f + cmp r3,#0 + beq 10f @ x==0 going into division? + lsls r4,r3,#1 + asrs r4,#21 + adds r4,#1 + bne 1f @ x==Inf going into division? + lsls r4,r1,#1 + asrs r4,#21 + adds r4,#1 @ y also ±Inf? + bne 10f + subs r1,#1 @ make them both just finite + subs r3,#1 + b 1f + +10: + movs r0,#0 + movs r1,#0 + b 12f + +1: + bl qfp_ddiv + movs r2,#62 + bl qfp_double2fix64 +@ r0:r1 y/x + mov r10,r0 + mov r11,r1 + movs r0,#0 @ ω=0 + movs r1,#0 + mov r8,r0 + movs r2,#1 + lsls r2,#30 + mov r9,r2 @ x=1 + + adr r4,dtab_cc + mov r12,r4 + movs r7,#1 + movs r6,#31 +1: + bl dcordic_vec_step + adds r7,#1 + subs r6,#1 + cmp r7,#33 + bne 1b +@ r0:r1 atan(y/x) Q62 +@ r8:r9 x residual Q62 +@ r10:r11 y residual Q62 + mov r2,r9 + mov r3,r10 + subs r2,#12 @ this makes atan(0)==0 +@ the following is basically a division residual y/x ~ atan(residual y/x) + movs r4,#1 + lsls r4,#29 + movs r7,#0 +2: + lsrs r2,#1 + movs r3,r3 @ preserve carry + bmi 1f + sbcs r3,r2 + adds r0,r4 + adcs r1,r7 + lsrs r4,#1 + bne 2b + b 3f +1: + adcs r3,r2 + subs r0,r4 + sbcs r1,r7 + lsrs r4,#1 + bne 2b +3: + lsls r6,r1,#31 + asrs r1,#1 + lsrs r0,#1 + orrs r0,r6 @ Q61 + +12: + pop {r6} + + cmp r6,#0 + beq 1f + ldr r4,=#0x885A308D @ π/2 Q61 + ldr r5,=#0x3243F6A8 + bpl 2f + mvns r4,r4 @ negative quadrant offset + mvns r5,r5 +2: + lsls r6,#31 + bne 2f @ skip if quadrant offset is ±1 + adds r0,r4 + adcs r1,r5 +2: + adds r0,r4 + adcs r1,r5 +1: + movs r2,#61 + bl qfp_fix642double + + bl pop_r8_r11 + pop {r4-r7,r15} + +.ltorg + +dtab_cc: +.word 0x61bb4f69, 0x1dac6705 @ atan 2^-1 Q62 +.word 0x96406eb1, 0x0fadbafc @ atan 2^-2 Q62 +.word 0xab0bdb72, 0x07f56ea6 @ atan 2^-3 Q62 +.word 0xe59fbd39, 0x03feab76 @ atan 2^-4 Q62 +.word 0xba97624b, 0x01ffd55b @ atan 2^-5 Q62 +.word 0xdddb94d6, 0x00fffaaa @ atan 2^-6 Q62 +.word 0x56eeea5d, 0x007fff55 @ atan 2^-7 Q62 +.word 0xaab7776e, 0x003fffea @ atan 2^-8 Q62 +.word 0x5555bbbc, 0x001ffffd @ atan 2^-9 Q62 +.word 0xaaaaadde, 0x000fffff @ atan 2^-10 Q62 +.word 0xf555556f, 0x0007ffff @ atan 2^-11 Q62 +.word 0xfeaaaaab, 0x0003ffff @ atan 2^-12 Q62 +.word 0xffd55555, 0x0001ffff @ atan 2^-13 Q62 +.word 0xfffaaaab, 0x0000ffff @ atan 2^-14 Q62 +.word 0xffff5555, 0x00007fff @ atan 2^-15 Q62 +.word 0xffffeaab, 0x00003fff @ atan 2^-16 Q62 +.word 0xfffffd55, 0x00001fff @ atan 2^-17 Q62 +.word 0xffffffab, 0x00000fff @ atan 2^-18 Q62 +.word 0xfffffff5, 0x000007ff @ atan 2^-19 Q62 
+.word 0xffffffff, 0x000003ff @ atan 2^-20 Q62 +.word 0x00000000, 0x00000200 @ atan 2^-21 Q62 @ consider optimising these +.word 0x00000000, 0x00000100 @ atan 2^-22 Q62 +.word 0x00000000, 0x00000080 @ atan 2^-23 Q62 +.word 0x00000000, 0x00000040 @ atan 2^-24 Q62 +.word 0x00000000, 0x00000020 @ atan 2^-25 Q62 +.word 0x00000000, 0x00000010 @ atan 2^-26 Q62 +.word 0x00000000, 0x00000008 @ atan 2^-27 Q62 +.word 0x00000000, 0x00000004 @ atan 2^-28 Q62 +.word 0x00000000, 0x00000002 @ atan 2^-29 Q62 +.word 0x00000000, 0x00000001 @ atan 2^-30 Q62 +.word 0x80000000, 0x00000000 @ atan 2^-31 Q62 +.word 0x40000000, 0x00000000 @ atan 2^-32 Q62 + +.thumb_func +qfp_dexp: + push {r4-r7,r14} + bl dunpacks + adr r4,dreddata1 + bl dreduce + cmp r1,#0 + bge 1f + ldr r4,=#0xF473DE6B + ldr r5,=#0x2C5C85FD @ ln2 Q62 + adds r0,r4 + adcs r1,r5 + subs r2,#1 +1: + push {r2} + movs r7,#1 @ shift + adr r6,dtab_exp + movs r2,#0 + movs r3,#1 + lsls r3,#30 @ x=1 Q62 + +3: + ldmia r6!,{r4,r5} + mov r12,r6 + subs r0,r4 + sbcs r1,r5 + bmi 1f + + rsbs r6,r7,#0 + adds r6,#32 @ complementary shift + movs r5,r3 + asrs r5,r7 + movs r4,r3 + lsls r4,r6 + movs r6,r2 + lsrs r6,r7 @ rounding bit in carry + orrs r4,r6 + adcs r2,r4 + adcs r3,r5 @ x+=x>>i + b 2f + +1: + adds r0,r4 @ restore argument + adcs r1,r5 +2: + mov r6,r12 + adds r7,#1 + cmp r7,#33 + bne 3b + +@ here +@ r0:r1 ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0) +@ r2:r3 exp a Q62 +@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε) + muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1 +@ r4:r1 ε exp a Q(62+62-32)=Q92 + lsrs r4,#30 + lsls r0,r1,#2 + orrs r0,r4 + asrs r1,#30 + adds r0,r2 + adcs r1,r3 + + pop {r2} + rsbs r2,#0 + adds r2,#62 + bl qfp_fix642double @ in principle we can pack faster than this because we know the exponent + pop {r4-r7,r15} + +.ltorg + +.thumb_func +qfp_dln: + push {r4-r7,r14} + lsls r7,r1,#1 + bcs 5f @ <0 ... + asrs r7,#21 + beq 5f @ ... or =0? return -Inf + adds r7,#1 + beq 6f @ Inf/NaN? return +Inf + bl dunpacks + push {r2} + lsls r1,#9 + lsrs r2,r0,#23 + orrs r1,r2 + lsls r0,#9 +@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1 + + movs r7,#1 @ shift + adr r6,dtab_exp + mov r12,r6 + movs r2,#0 + movs r3,#0 @ y=0 Q62 + +3: + rsbs r6,r7,#0 + adds r6,#32 @ complementary shift + movs r5,r1 + asrs r5,r7 + movs r4,r1 + lsls r4,r6 + movs r6,r0 + lsrs r6,r7 + orrs r4,r6 @ x>>i, rounding bit in carry + adcs r4,r0 + adcs r5,r1 @ x+(x>>i) + + lsrs r6,r5,#30 + bne 1f @ x+(x>>i)>1? + movs r0,r4 + movs r1,r5 @ x+=x>>i + mov r6,r12 + ldmia r6!,{r4,r5} + subs r2,r4 + sbcs r3,r5 + +1: + movs r4,#8 + add r12,r4 + adds r7,#1 + cmp r7,#33 + bne 3b +@ here: +@ r0:r1 residual x, nearly 1 Q62 +@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62 +@ result is y + ln2 + ln x ~ y + ln2 + (x-1) + lsls r1,#2 + asrs r1,#2 @ x-1 + adds r2,r0 + adcs r3,r1 + + pop {r7} +@ here: +@ r2:r3 ln m/2 = ln m - ln2 Q62 +@ r7 unbiased exponent + + adr r4,dreddata1+4 + ldmia r4,{r0,r1,r4} + adds r7,#1 + muls r0,r7 @ Q62 + muls r1,r7 @ Q41 + muls r4,r7 @ Q20 + lsls r7,r1,#21 + asrs r1,#11 + asrs r5,r1,#31 + adds r0,r7 + adcs r1,r5 + lsls r7,r4,#10 + asrs r4,#22 + asrs r5,r1,#31 + adds r1,r7 + adcs r4,r5 +@ r0:r1:r4 exponent*ln2 Q62 + asrs r5,r3,#31 + adds r0,r2 + adcs r1,r3 + adcs r4,r5 +@ r0:r1:r4 result Q62 + movs r2,#62 +1: + asrs r5,r1,#31 + cmp r4,r5 + beq 2f @ r4 a sign extension of r1? 
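+
+@ (The loop below renormalises the Q62 triple-word sum in r0:r1:r4 until r4
+@ is a pure sign extension of r1, adjusting the Q value in r2 to match.)
+@ Illustrative C model of the ln pseudo-multiplication at label 3 above (not
+@ part of qfplib): double arithmetic stands in for the Q62 fixed point. Only
+@ the mantissa is modelled; halving it contributes the ln 2 term, and the
+@ remaining exponent multiples of ln 2 are added separately above.
+/*
+#include <math.h>
+
+static double ln_model(double m)          // m in [1,2): the mantissa
+{
+    double x = m / 2, y = 0;              // 0.5 <= x < 1
+    for (int i = 1; i <= 32; i++) {
+        double t = x + ldexp(x, -i);      // x + (x >> i)
+        if (t < 1.0) {                    // keep x below 1.0
+            x = t;
+            y -= log1p(ldexp(1.0, -i));   // subtract log(1+2^-i), cf. dtab_exp
+        }
+    }
+    return y + log(2.0) + (x - 1.0);      // ln m = ln(m/2) + ln 2; ln x ~ x-1
+}
+*/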
+ lsrs r0,#4 @ no: shift down 4 places and try again + lsls r6,r1,#28 + orrs r0,r6 + lsrs r1,#4 + lsls r6,r4,#28 + orrs r1,r6 + asrs r4,#4 + subs r2,#4 + b 1b +2: + bl qfp_fix642double + pop {r4-r7,r15} + +5: + ldr r1,=#0xfff00000 + movs r0,#0 + pop {r4-r7,r15} + +6: + ldr r1,=#0x7ff00000 + movs r0,#0 + pop {r4-r7,r15} + + +.ltorg + +.align 2 +dreddata1: +.word 0x0000B8AA @ 1/ln2 Q15 +.word 0x0013DE6B @ ln2 Q62 Q62=2C5C85FDF473DE6B split into 21-bit pieces +.word 0x000FEFA3 +.word 0x000B1721 + +dtab_exp: +.word 0xbf984bf3, 0x19f323ec @ log 1+2^-1 Q62 +.word 0xcd4d10d6, 0x0e47fbe3 @ log 1+2^-2 Q62 +.word 0x8abcb97a, 0x0789c1db @ log 1+2^-3 Q62 +.word 0x022c54cc, 0x03e14618 @ log 1+2^-4 Q62 +.word 0xe7833005, 0x01f829b0 @ log 1+2^-5 Q62 +.word 0x87e01f1e, 0x00fe0545 @ log 1+2^-6 Q62 +.word 0xac419e24, 0x007f80a9 @ log 1+2^-7 Q62 +.word 0x45621781, 0x003fe015 @ log 1+2^-8 Q62 +.word 0xa9ab10e6, 0x001ff802 @ log 1+2^-9 Q62 +.word 0x55455888, 0x000ffe00 @ log 1+2^-10 Q62 +.word 0x0aa9aac4, 0x0007ff80 @ log 1+2^-11 Q62 +.word 0x01554556, 0x0003ffe0 @ log 1+2^-12 Q62 +.word 0x002aa9ab, 0x0001fff8 @ log 1+2^-13 Q62 +.word 0x00055545, 0x0000fffe @ log 1+2^-14 Q62 +.word 0x8000aaaa, 0x00007fff @ log 1+2^-15 Q62 +.word 0xe0001555, 0x00003fff @ log 1+2^-16 Q62 +.word 0xf80002ab, 0x00001fff @ log 1+2^-17 Q62 +.word 0xfe000055, 0x00000fff @ log 1+2^-18 Q62 +.word 0xff80000b, 0x000007ff @ log 1+2^-19 Q62 +.word 0xffe00001, 0x000003ff @ log 1+2^-20 Q62 +.word 0xfff80000, 0x000001ff @ log 1+2^-21 Q62 +.word 0xfffe0000, 0x000000ff @ log 1+2^-22 Q62 +.word 0xffff8000, 0x0000007f @ log 1+2^-23 Q62 +.word 0xffffe000, 0x0000003f @ log 1+2^-24 Q62 +.word 0xfffff800, 0x0000001f @ log 1+2^-25 Q62 +.word 0xfffffe00, 0x0000000f @ log 1+2^-26 Q62 +.word 0xffffff80, 0x00000007 @ log 1+2^-27 Q62 +.word 0xffffffe0, 0x00000003 @ log 1+2^-28 Q62 +.word 0xfffffff8, 0x00000001 @ log 1+2^-29 Q62 +.word 0xfffffffe, 0x00000000 @ log 1+2^-30 Q62 +.word 0x80000000, 0x00000000 @ log 1+2^-31 Q62 +.word 0x40000000, 0x00000000 @ log 1+2^-32 Q62 + +qfp_lib_end: diff --git a/sos-iir-filter.h b/sos-iir-filter.h index cc90b35..4d345f1 100644 --- a/sos-iir-filter.h +++ b/sos-iir-filter.h @@ -26,11 +26,50 @@ #include #include -#include -namespace fpm = std; -using sos_t = float; -//#include -//using sos_t = fpm::fixed_16_16; +extern "C" { +#include +} + +float qfp_fpow(float b, float e) +{ + return qfp_fexp(e * qfp_fln(b)); +} + +float qfp_flog10(float x) +{ + return qfp_fln(x) / qfp_fln(10); +} + +struct sos_t +{ + float v; + + constexpr sos_t(float v_ = 0.f): v(v_) {} + + sos_t operator+(const sos_t& o) const noexcept { + return qfp_fadd(v, o.v); + } + + sos_t operator-(const sos_t& o) const noexcept { + return qfp_fsub(v, o.v); + } + + sos_t operator*(const sos_t& o) const noexcept { + return qfp_fmul(v, o.v); + } + + sos_t operator/(const sos_t& o) const noexcept { + return qfp_fdiv(v, o.v); + } + + sos_t& operator+=(const sos_t& o) noexcept { + return (*this = *this + o); + } + + operator float() const noexcept { + return v; + } +}; struct SOS_Coefficients { sos_t b1; @@ -60,8 +99,7 @@ struct SOS_IIR_Filter { std::copy(_sos, _sos + N, sos.begin()); } - void filter(auto samples) { - // Apply all but last Second-Order-Section + void filter(auto samples, std::size_t n = N) { for ([[maybe_unused]] auto _ : std::views::zip_transform( [](auto samps, const auto& coeffs, auto& ww) { // Assumes a0 and b0 coefficients are one (1.0) @@ -72,13 +110,25 @@ struct SOS_IIR_Filter { } return 0; }, - std::views::repeat(samples, N), sos, w)) {} + 
std::views::repeat(samples, n), sos, w)) {} } sos_t filter_sum_sqr(auto samples) { - filter(samples); - return std::ranges::fold_left(samples, sos_t(0), - [this](auto a, auto b) { return a + b * gain * b * gain; }); + const auto& coeffs = sos.back(); + auto& ww = w.back(); + sos_t sum_sqr (0.f); + + filter(samples, N - 1); + + // Assumes a0 and b0 coefficients are one (1.0) + for (auto& s : samples) { + auto f6 = s + coeffs.a1 * ww.w0 + coeffs.a2 * ww.w1; + s = f6 + coeffs.b1 * ww.w0 + coeffs.b2 * ww.w1; + ww.w1 = std::exchange(ww.w0, f6); + sum_sqr += s * gain * s * gain; + } + + return sum_sqr; } }; @@ -90,12 +140,12 @@ struct SOS_IIR_Filter { // A ~= [1.0, -1.993853, 0.993863] // With additional DC blocking component SOS_IIR_Filter SPH0645LM4H_B_RB = { - /* gain: */ sos_t(1.00123377961525), + /* gain: */ sos_t(1.00123377961525f), /* sos: */ { // Second-Order Sections {b1, b2, -a1, -a2} - { sos_t(-1.0), sos_t(0.0), - sos_t(+0.9992), sos_t(0) }, // DC blocker, a1 = -0.9992 - { sos_t(-1.988897663539382), sos_t(+0.988928479008099), - sos_t(+1.993853376183491), sos_t(-0.993862821429572) } } + { sos_t(-1.0f), sos_t(0.0f), + sos_t(+0.9992f), sos_t(0.0f) }, // DC blocker, a1 = -0.9992 + { sos_t(-1.988897663539382f), sos_t(+0.988928479008099f), + sos_t(+1.993853376183491f), sos_t(-0.993862821429572f) } } }; // @@ -104,14 +154,14 @@ SOS_IIR_Filter SPH0645LM4H_B_RB = { // B = [0.169994948147430, 0.280415310498794, -1.120574766348363, 0.131562559965936, 0.974153561246036, -0.282740857326553, -0.152810756202003] // A = [1.0, -2.12979364760736134, 0.42996125885751674, 1.62132698199721426, -0.96669962900852902, 0.00121015844426781, 0.04400300696788968] SOS_IIR_Filter A_weighting = { - /* gain: */ sos_t(0.169994948147430), + /* gain: */ sos_t(0.169994948147430f), /* sos: */ { // Second-Order Sections {b1, b2, -a1, -a2} - { sos_t(-2.00026996133106), sos_t(+1.00027056142719), - sos_t(-1.060868438509278), sos_t(-0.163987445885926) }, - { sos_t(+4.35912384203144), sos_t(+3.09120265783884), - sos_t(+1.208419926363593), sos_t(-0.273166998428332) }, - { sos_t(-0.70930303489759), sos_t(-0.29071868393580), - sos_t(+1.982242159753048), sos_t(-0.982298594928989) } } + { sos_t(-2.00026996133106f), sos_t(+1.00027056142719f), + sos_t(-1.060868438509278f), sos_t(-0.163987445885926f) }, + { sos_t(+4.35912384203144f), sos_t(+3.09120265783884f), + sos_t(+1.208419926363593f), sos_t(-0.273166998428332f) }, + { sos_t(-0.70930303489759f), sos_t(-0.29071868393580f), + sos_t(+1.982242159753048f), sos_t(-0.982298594928989f) } } }; ////
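+
+// Illustrative use of the qfplib-backed sos_t and the filters above (a
+// sketch, not part of this header): REF_AMPL is a hypothetical stand-in for
+// whatever reference amplitude the caller derives from the microphone's
+// sensitivity.
+/*
+    std::array<sos_t, 1024> buf;                      // filled with samples
+    float sum_sqr = A_weighting.filter_sum_sqr(buf);  // weighted sum of squares
+    float rms     = qfp_fsqrt(sum_sqr / buf.size());  // root mean square
+    float db      = 20.f * qfp_flog10(rms / REF_AMPL);
+*/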