From 7fad01e939f72195c3a0af76f878ddb488b545b7 Mon Sep 17 00:00:00 2001
From: Clyne Sullivan <clyne@bitgloo.com>
Date: Fri, 7 Jun 2024 07:36:24 -0400
Subject: [PATCH] go os-less; filter in sram; consider dynamic freq

---
 Makefile         |   9 +-
 STM32G031x6.ld   |   4 +-
 cfg/chconf.h     |   2 +-
 main.cpp         |  79 ++++++----
 osalconf.h       |  68 +++++++++
 qfplib-port.h    | 381 +++++++++++++++++++++++++++++++++++++++++++++++
 sos-iir-filter.h |  16 +-
 7 files changed, 514 insertions(+), 45 deletions(-)
 create mode 100644 osalconf.h
 create mode 100644 qfplib-port.h
diff --git a/Makefile b/Makefile
index bfd4c37..e08fa19 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@
 
 # Compiler options here.
 ifeq ($(USE_OPT),)
-  USE_OPT = -O3 -ggdb -fomit-frame-pointer -falign-functions=16 -fno-stack-protector
+  USE_OPT = -O3 -fomit-frame-pointer -falign-functions=16 -fno-stack-protector
 endif
 
 # C specific options here (added to USE_OPT).
@@ -102,10 +102,11 @@ include $(CHIBIOS)/os/common/startup/ARMCMx/compilers/GCC/mk/startup_stm32g0xx.m
 include $(CHIBIOS)/os/hal/hal.mk
 include $(CHIBIOS)/os/hal/ports/STM32/STM32G0xx/platform.mk
 #include $(CHIBIOS)/os/hal/boards/ST_NUCLEO64_G071RB/board.mk
-include $(CHIBIOS)/os/hal/osal/rt-nil/osal.mk
+#include $(CHIBIOS)/os/hal/osal/rt-nil/osal.mk
+include $(CHIBIOS)/os/hal/osal/os-less/ARMCMx/osal.mk
 # RTOS files (optional).
-include $(CHIBIOS)/os/nil/nil.mk
-include $(CHIBIOS)/os/common/ports/ARMv6-M/compilers/GCC/mk/port.mk
+#include $(CHIBIOS)/os/nil/nil.mk
+#include $(CHIBIOS)/os/common/ports/ARMv6-M/compilers/GCC/mk/port.mk
 ## Auto-build files in ./source recursively.
 #include $(CHIBIOS)/tools/mk/autobuild.mk
 ## Other files (optional).
diff --git a/STM32G031x6.ld b/STM32G031x6.ld
index ea0c0d4..d53e6d5 100644
--- a/STM32G031x6.ld
+++ b/STM32G031x6.ld
@@ -27,7 +27,7 @@ MEMORY
     flash5 (rx) : org = 0x00000000, len = 0
     flash6 (rx) : org = 0x00000000, len = 0
     flash7 (rx) : org = 0x00000000, len = 0
-    ram0   (wx) : org = 0x20000000, len = 8k
+    ram0   (rwx) : org = 0x20000000, len = 8k
     ram1   (wx) : org = 0x00000000, len = 0
     ram2   (wx) : org = 0x00000000, len = 0
     ram3   (wx) : org = 0x00000000, len = 0
@@ -79,7 +79,7 @@ REGION_ALIAS("DATA_RAM_LMA", flash0);
 REGION_ALIAS("BSS_RAM", ram0);
 
 /* RAM region to be used for the default heap.*/
-REGION_ALIAS("HEAP_RAM", ram0);
+REGION_ALIAS("HEAP_RAM", ram1);
 
 /* Generic rules inclusion.*/
 INCLUDE rules.ld
diff --git a/cfg/chconf.h b/cfg/chconf.h
index 8b143c3..5f90c03 100644
--- a/cfg/chconf.h
+++ b/cfg/chconf.h
@@ -48,7 +48,7 @@
  *          (0..CH_CFG_MAX_THREADS-1).
  */
 #if !defined(CH_CFG_MAX_THREADS)
-#define CH_CFG_MAX_THREADS                  2
+#define CH_CFG_MAX_THREADS                  1
 #endif
 
 /**
diff --git a/main.cpp b/main.cpp
index 5a04302..e3f49f0 100644
--- a/main.cpp
+++ b/main.cpp
@@ -15,10 +15,10 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 #include "hal.h"
-#include "ch.h"
 #include "sos-iir-filter.h"
 
 #include <algorithm>
+#include <atomic>
 #include <array>
 #include <cstring>
 #include <ranges>
@@ -40,13 +40,11 @@ static constexpr unsigned I2S_STRIDE = 16;
 static const auto MIC_REF_AMPL = sos_t((1 << (MIC_BITS - 1)) - 1) *
     qfp_fpow(10.f, MIC_SENSITIVITY / 20.f);
 
-static SEMAPHORE_DECL(i2sReady, 0);
-static THD_WORKING_AREA(waThread1, 128);
+static std::atomic_bool i2sReady;
 static std::array<uint32_t, I2S_BUFSIZ> i2sBuffer;
 static sos_t Leq_sum_sqr (0.f);
 static unsigned Leq_samples = 0;
 
-static THD_FUNCTION(Thread1, arg);
 static void i2sCallback(I2SDriver *i2s);
 
 static constexpr unsigned I2SPRval = 16'000'000 / SAMPLE_RATE / 32 / 2;
@@ -62,23 +60,35 @@ static constexpr I2SConfig i2sConfig = {
     /* I2SPR */     (I2SPRval / 2) | ((I2SPRval & 1) ? SPI_I2SPR_ODD : 0)
 };
 
-THD_TABLE_BEGIN
-  THD_TABLE_THREAD(0, "main",     waThread1,       Thread1,      NULL)
-THD_TABLE_END
+//static const halclkcfg_t halClockDefault = {
+//  .pwr_cr1              = PWR_CR1_VOS_0,
+//  .pwr_cr2              = STM32_PWR_CR2,
+//  .rcc_cr               = RCC_CR_PLLON | RCC_CR_HSION,
+//  .rcc_cfgr             = (6 << RCC_CFGR_PPRE_Pos) | (1 << RCC_CFGR_HPRE_Pos) | RCC_CFGR_SW_PLL,
+//  .rcc_pllcfgr          = (STM32_PLLR_VALUE << RCC_PLLCFGR_PLLR_Pos) | RCC_PLLCFGR_PLLREN |
+//                          (STM32_PLLN_VALUE << RCC_PLLCFGR_PLLN_Pos) |
+//                          ((STM32_PLLM_VALUE - 1) << RCC_PLLCFGR_PLLM_Pos) |
+//                          RCC_PLLCFGR_PLLSRC_HSI,
+//  .flash_acr            = FLASH_ACR_PRFTEN | FLASH_ACR_ICEN | (2 << FLASH_ACR_LATENCY_Pos)
+//};
+//
+//static const halclkcfg_t halClockSleep = {
+//  .pwr_cr1              = PWR_CR1_VOS_0,
+//  .pwr_cr2              = STM32_PWR_CR2,
+//  .rcc_cr               = RCC_CR_PLLON | RCC_CR_HSION,
+//  .rcc_cfgr             = (0 << RCC_CFGR_PPRE_Pos) | (10 << RCC_CFGR_HPRE_Pos) | RCC_CFGR_SW_PLL,
+//  .rcc_pllcfgr          = (STM32_PLLR_VALUE << RCC_PLLCFGR_PLLR_Pos) | RCC_PLLCFGR_PLLREN |
+//                          (STM32_PLLN_VALUE << RCC_PLLCFGR_PLLN_Pos) |
+//                          ((STM32_PLLM_VALUE - 1) << RCC_PLLCFGR_PLLM_Pos) |
+//                          RCC_PLLCFGR_PLLSRC_HSI,
+//  .flash_acr            = FLASH_ACR_PRFTEN | FLASH_ACR_ICEN | (2 << FLASH_ACR_LATENCY_Pos)
+//};
 
 int main(void)
 {
     halInit();
-    chSysInit();
-    for (;;)
-        asm("wfi");
-}
-
-THD_FUNCTION(Thread1, arg)
-{
-    (void)arg;
-  
-    chThdSleepMilliseconds(2000);
+    osalSysEnable();
+    osalThreadSleepMilliseconds(2000);
   
     palSetPadMode(GPIOB, 7, PAL_MODE_OUTPUT_PUSHPULL);
     palSetPadMode(GPIOF, 2, PAL_MODE_UNCONNECTED);
@@ -89,16 +99,28 @@ THD_FUNCTION(Thread1, arg)
   
     sdStart(&SD2, NULL);
     sdWrite(&SD2, (uint8_t *)"Noisemeter\n", 11);
-    chThdSleepMilliseconds(100);
+    osalThreadSleepMilliseconds(2);
   
     i2sStart(&I2SD1, &i2sConfig);
     i2sStartExchange(&I2SD1);
+
+    i2sReady.store(false);
   
     uint8_t strbuf[7] = { 0, 0, 0, 'd', 'B', '\n', '\0' };
     for (;;) {
-        palSetPad(GPIOB, 7);
-        chSemWait(&i2sReady);
+        //if (halClockSwitchMode(&halClockSleep)) {
+        //    sdWrite(&SD2, (uint8_t *)"sleepf\n", 7);
+        //    osalThreadSleepMilliseconds(5000);
+        //}
+        while (!i2sReady.load())
+            asm("wfi");
+        i2sReady.store(false);
+        //if (halClockSwitchMode(&halClockDefault)) {
+        //    sdWrite(&SD2, (uint8_t *)"sleepf\n", 7);
+        //    osalThreadSleepMilliseconds(5000);
+        //}
 
+        palSetPad(GPIOB, 7);
         const sos_t Leq_RMS = qfp_fsqrt(Leq_sum_sqr / qfp_uint2float(Leq_samples));
         const sos_t Leq_dB = MIC_OFFSET_DB + MIC_REF_DB + sos_t(20.f) *
             qfp_flog10(Leq_RMS / MIC_REF_AMPL);
@@ -110,24 +132,29 @@ THD_FUNCTION(Thread1, arg)
         strbuf[1] = n % 10 + '0'; n /= 10;
         strbuf[0] = n ? n + '0' : ' ';
         sdWrite(&SD2, strbuf, sizeof(strbuf));
+        osalThreadSleepMilliseconds(2);
         palClearPad(GPIOB, 7);
     }
 }
 
+__attribute__((section(".data")))  // emitted into .data instead of the default code section; presumably so it executes from RAM (TODO confirm)
 int32_t fixsample(uint32_t s) {
     return (int32_t)(((s & 0xFFFF) << 16) | (s >> 16)) >> (32 - MIC_BITS);
 }
 
+__attribute__((section(".data")))
 void i2sCallback(I2SDriver *i2s)
 {
+    //halClockSwitchMode(&halClockDefault);
+
     palSetPad(GPIOB, 7);
     const auto halfsize = i2sBuffer.size() / 2;
-    const auto offset = i2sIsBufferComplete(i2s) ? halfsize : 0;
-    auto samples = reinterpret_cast<sos_t *>(i2sBuffer.data() + offset);
+    const auto source = i2sBuffer.data() + (i2sIsBufferComplete(i2s) ? halfsize : 0);
+    auto samples = reinterpret_cast<sos_t *>(source);
     std::ranges::copy(
-        std::views::counted(i2sBuffer.begin() + offset, halfsize / I2S_STRIDE)
+        std::views::counted(source, halfsize / I2S_STRIDE)
             | std::ranges::views::stride(2)
-            | std::views::transform([](uint32_t s) { return sos_t(qfp_int2float(fixsample(s))); }),
+            | std::views::transform([](uint32_t s) { return sos_t(qfp_int2float_asm(fixsample(s))); }),
         samples);
     auto samps = std::views::counted(samples, halfsize / (2 * I2S_STRIDE));
 
@@ -138,8 +165,10 @@ void i2sCallback(I2SDriver *i2s)
 
     // Wakeup main thread for dB calculation every second
     if (Leq_samples >= SAMPLE_RATE / I2S_STRIDE) {
-        chSemSignalI(&i2sReady);
+        i2sReady.store(true);
     }
     palClearPad(GPIOB, 7);
+
+    //halClockSwitchMode(&halClockSleep);
 }
 
diff --git a/osalconf.h b/osalconf.h
new file mode 100644
index 0000000..f6baf01
--- /dev/null
+++ b/osalconf.h
@@ -0,0 +1,68 @@
+/*
+    ChibiOS - Copyright (C) 2006..2018 Giovanni Di Sirio
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+
+/**
+ * @file    osalconf.h
+ * @brief   Bare-metal OSAL configuration header.
+ *
+ * @addtogroup OSAL_CONF
+ * @{
+ */
+
+#ifndef OSALCONF_H
+#define OSALCONF_H
+
+/**
+ * @brief   Frequency in Hertz of the system tick.
+ */
+#if !defined(OSAL_ST_FREQUENCY) || defined(__DOXYGEN__)
+#define OSAL_ST_FREQUENCY                   20
+#endif
+
+/**
+ * @brief   Enables OSAL assertions.
+ */
+#if !defined(OSAL_DBG_ENABLE_ASSERTS) || defined(__DOXYGEN__)
+#define OSAL_DBG_ENABLE_ASSERTS             FALSE
+#endif
+
+/**
+ * @brief   Enables OSAL functions parameters checks.
+ */
+#if !defined(OSAL_DBG_ENABLE_CHECKS) || defined(__DOXYGEN__)
+#define OSAL_DBG_ENABLE_CHECKS              FALSE
+#endif
+
+/**
+ * @brief   OSAL initialization hook.
+ */
+#if !defined(OSAL_INIT_HOOK) || defined(__DOXYGEN__)
+#define OSAL_INIT_HOOK() {                                                  \
+}
+#endif
+
+/**
+ * @brief   Idle loop hook macro.
+ */
+#if !defined(OSAL_IDLE_HOOK) || defined(__DOXYGEN__)
+#define OSAL_IDLE_HOOK() {                                                  \
+    asm("wfi");                                                             \
+}
+#endif
+
+#endif /* OSALCONF_H */
+
+/** @} */
diff --git a/qfplib-port.h b/qfplib-port.h
new file mode 100644
index 0000000..1070d35
--- /dev/null
+++ b/qfplib-port.h
@@ -0,0 +1,381 @@
+inline float qfp_fpow(float b, float e)  // returns qfp_fexp(e * qfp_fln(b)), i.e. b^e if qfp_fexp/qfp_fln are exp/ln (TODO confirm)
+{
+    return qfp_fexp(qfp_fmul(e, qfp_fln(b)));  // the product e * ln(b) is formed with qfp_fmul, not the native operator
+}
+
+inline float qfp_flog10(float x)  // returns qfp_fln(x) / qfp_fln(10), i.e. log10(x) if qfp_fln is the natural log (TODO confirm)
+{
+    static const auto ln10 = qfp_fln(10.f);  // function-local static: initialised on the first call, reused afterwards
+    return qfp_fdiv(qfp_fln(x), ln10);
+}
+
+__attribute__((naked, section(".data")))
+inline float qfp_fadd_asm(float, float)
+{
+asm(R"(
+.syntax unified
+ push {r4,r5,r6,r14}
+ asrs r4,r0,#31
+ lsls r2,r0,#1
+ lsrs r2,#24     @ x exponent
+ beq fa_xe0
+ cmp r2,#255
+ beq fa_xe255
+fa_xe:
+ asrs r5,r1,#31
+ lsls r3,r1,#1
+ lsrs r3,#24     @ y exponent
+ beq fa_ye0
+ cmp r3,#255
+ beq fa_ye255
+fa_ye:
+ ldr r6,=#0x007fffff
+ ands r0,r0,r6   @ extract mantissa bits
+ ands r1,r1,r6
+ adds r6,#1      @ r6=0x00800000
+ orrs r0,r0,r6   @ set implied 1
+ orrs r1,r1,r6
+ eors r0,r0,r4   @ complement...
+ eors r1,r1,r5
+ subs r0,r0,r4   @ ... and add 1 if sign bit is set: 2's complement
+ subs r1,r1,r5
+ subs r5,r3,r2   @ ye-xe
+ subs r4,r2,r3   @ xe-ye
+ bmi fa_ygtx
+@ here xe>=ye
+ cmp r4,#30
+ bge fa_xmgty    @ xe much greater than ye?
+ adds r5,#32
+ movs r3,r2      @ save exponent
+@ here y in r1 must be shifted down r4 places to align with x in r0
+ movs r2,r1
+ lsls r2,r2,r5   @ keep the bits we will shift off the bottom of r1
+ asrs r1,r1,r4
+ b fa_0
+
+.ltorg
+ 
+fa_ymgtx:
+ movs r2,#0      @ result is just y
+ movs r0,r1
+ b fa_1
+fa_xmgty:
+ movs r3,r2      @ result is just x
+ movs r2,#0
+ b fa_1
+
+fa_ygtx:
+@ here ye>xe
+ cmp r5,#30
+ bge fa_ymgtx    @ ye much greater than xe?
+ adds r4,#32
+@ here x in r0 must be shifted down r5 places to align with y in r1
+ movs r2,r0
+ lsls r2,r2,r4   @ keep the bits we will shift off the bottom of r0
+ asrs r0,r0,r5
+fa_0:
+ adds r0,r1      @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3
+ beq fa_9        @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0
+fa_1: 
+ lsrs r1,r0,#31  @ sign bit
+ beq fa_8
+ mvns r0,r0
+ rsbs r2,r2,#0
+ bne fa_8
+ adds r0,#1
+fa_8:
+ adds r6,r6
+@ r6=0x01000000
+ cmp r0,r6
+ bhs fa_2
+fa_3:
+ adds r2,r2      @ normalisation loop
+ adcs r0,r0
+ subs r3,#1      @ adjust exponent
+ cmp r0,r6
+ blo fa_3
+fa_2:
+@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit
+ lsrs r0,#1
+ bcc fa_4
+@ rounding bits here are 1:r2
+ adds r0,#1      @ round up
+ cmp r2,#0
+ beq fa_5        @ sticky bits all zero?
+fa_4:
+ cmp r3,#254
+ bhs fa_6        @ exponent too large or negative?
+ lsls r1,#31     @ pack everything
+ add r0,r1
+ lsls r3,#23
+ add r0,r3
+fa_end:
+ pop {r4,r5,r6,r15}
+
+fa_9:
+ cmp r2,#0       @ result zero?
+ beq fa_end      @ return +0
+ b fa_1
+
+fa_5:
+ lsrs r0,#1
+ lsls r0,#1      @ round to even
+ b fa_4
+
+fa_6:
+ bge fa_7
+@ underflow
+@ can handle denormals here
+ lsls r0,r1,#31  @ result is signed zero
+ pop {r4,r5,r6,r15}
+fa_7:
+@ overflow
+ lsls r0,r1,#8
+ adds r0,#255
+ lsls r0,#23     @ result is signed infinity
+ pop {r4,r5,r6,r15}
+
+
+fa_xe0:
+@ can handle denormals here
+ subs r2,#32
+ adds r2,r4       @ exponent -32 for +0, -33 for -0
+ b fa_xe
+
+fa_xe255:
+@ can handle NaNs here
+ lsls r2,#8
+ add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
+ b fa_xe
+
+fa_ye0:
+@ can handle denormals here
+ subs r3,#32
+ adds r3,r5       @ exponent -32 for +0, -33 for -0
+ b fa_ye
+
+fa_ye255:
+@ can handle NaNs here
+ lsls r3,#8
+ add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
+ b fa_ye
+ )");
+}
+
+__attribute__((naked, section(".data")))  // naked: the asm body below is the whole function and does its own push/pop; emitted into .data
+inline float qfp_fmul_asm(float, float)  // single-precision multiply in Thumb assembly: operands are read from r0 and r1, the packed result is left in r0
+{
+asm(R"(
+.syntax unified
+ push {r7,r14}
+ mov r2,r0
+ eors r2,r1       @ sign of result
+ lsrs r2,#31
+ lsls r2,#31
+ mov r14,r2
+ lsls r0,#1
+ lsls r1,#1
+ lsrs r2,r0,#24   @ xe
+ beq fm_xe0
+ cmp r2,#255
+ beq fm_xe255
+fm_xe:
+ lsrs r3,r1,#24   @ ye
+ beq fm_ye0
+ cmp r3,#255
+ beq fm_ye255
+fm_ye:
+ adds r7,r2,r3    @ exponent of result (will possibly be incremented)
+ subs r7,#128     @ adjust bias for packing
+ lsls r0,#8       @ x mantissa
+ lsls r1,#8       @ y mantissa
+ lsrs r0,#9
+ lsrs r1,#9
+
+ adds r2,r0,r1    @ for later
+ mov r12,r2
+ lsrs r2,r0,#7    @ x[22..7] Q16
+ lsrs r3,r1,#7    @ y[22..7] Q16
+ muls r2,r2,r3    @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
+ muls r0,r0,r1    @ result [31..0] Q46
+ lsrs r2,#18      @ result [45..32] Q14
+ bcc 1f
+ cmp r0,#0
+ bmi 1f
+ adds r2,#1       @ fix error in r2
+1:
+ lsls r3,r0,#9    @ bits off bottom of result
+ lsrs r0,#23      @ Q23
+ lsls r2,#9
+ adds r0,r2       @ cut'n'shut
+ add r0,r12       @ implied 1*(x+y) to compensate for no insertion of implied 1s
+@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
+
+ lsrs r1,r0,#23
+ bne fm_0         @ branch if we need to shift down one place
+@ here 1<=result<2
+ cmp r7,#254
+ bhs fm_3a        @ catches both underflow and overflow
+ lsls r3,#1       @ sticky bits at top of R3, rounding bit in carry
+ bcc fm_1         @ no rounding
+ beq fm_2         @ rounding tie?
+ adds r0,#1       @ round up
+fm_1:
+ adds r7,#1       @ for implied 1
+ lsls r7,#23      @ pack result
+ add r0,r7
+ add r0,r14
+ pop {r7,r15}
+fm_2:             @ rounding tie
+ adds r0,#1
+fm_3:
+ lsrs r0,#1
+ lsls r0,#1       @ clear bottom bit
+ b fm_1
+
+@ here 1<=result-1<3
+fm_0:
+ adds r7,#1       @ increment exponent
+ cmp r7,#254
+ bhs fm_3b        @ catches both underflow and overflow
+ lsrs r0,#1       @ shift mantissa down
+ bcc fm_1a        @ no rounding
+ adds r0,#1       @ assume we will round up
+ cmp r3,#0        @ sticky bits
+ beq fm_3c        @ rounding tie?
+fm_1a:
+ adds r7,r7
+ adds r7,#1       @ for implied 1
+ lsls r7,#22      @ pack result
+ add r0,r7
+ add r0,r14
+ pop {r7,r15}
+
+fm_3c:
+ lsrs r0,#1
+ lsls r0,#1       @ clear bottom bit
+ b fm_1a
+
+fm_xe0:
+ subs r2,#16
+fm_xe255:
+ lsls r2,#8
+ b fm_xe
+fm_ye0:
+ subs r3,#16
+fm_ye255:
+ lsls r3,#8
+ b fm_ye
+
+@ here the result is under- or overflowing
+fm_3b:
+ bge fm_4        @ branch on overflow
+@ trap case where result is denormal 0x007fffff + 0.5ulp or more
+ adds r7,#1      @ exponent=-1?
+ bne fm_5
+@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
+@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
+ adds r0,#2
+ lsrs r0,#23
+ cmp r0,#3
+ bne fm_5
+ b fm_6
+
+fm_3a:
+ bge fm_4        @ branch on overflow
+@ trap case where result is denormal 0x007fffff + 0.5ulp or more
+ adds r7,#1      @ exponent=-1?
+ bne fm_5
+ adds r0,#1      @ mantissa=0xffffff (i.e., r0=0x7fffff)?
+ lsrs r0,#23
+ beq fm_5
+fm_6:
+ movs r0,#1      @ return smallest normal
+ lsls r0,#23
+ add r0,r14
+ pop {r7,r15}
+
+fm_5:
+ mov r0,r14
+ pop {r7,r15}
+fm_4:
+ movs r0,#0xff
+ lsls r0,#23
+ add r0,r14
+ pop {r7,r15}
+)");
+}
+
+__attribute__((naked, section(".data")))  // naked: the asm body below is the whole function and does its own push/pop; emitted into .data
+inline float qfp_int2float_asm(int)  // converts the signed integer in r0 to a packed float (sign, exponent, mantissa) left in r0
+{
+asm(R"(
+.syntax unified
+ movs r1,#0      @ fall through
+ push {r4,r5,r6,r14}
+ movs r2,#29
+ subs r2,r1      @ fix exponent
+ movs r5,#0
+ bl qfp_int2float_packx
+ pop {r4,r5,r6,r15}
+qfp_int2float_packx:
+ lsrs r4,r0,#31 @ save sign bit
+ lsls r4,r4,#31 @ sign now in b31
+ bpl 2f         @ skip if positive
+ cmp r5,#0
+ beq 11f
+ adds r0,#1     @ fiddle carry in to following rsb if sticky bits are non-zero
+11:
+ rsbs r0,#0     @ can now treat r0 as unsigned
+ bmi 3f         @ catch r0=0x80000000 case
+2:
+ subs r2,#1     @ normalisation loop
+ adds r0,r0
+ beq 1f         @ zero? special case
+ bpl 2b         @ normalise so leading "1" in bit 31
+3:
+ adds r2,#129   @ (mis-)offset exponent
+ bne 12f        @ special case: highest denormal can round to lowest normal
+ adds r0,#0x80  @ in special case, need to add 256 to r0 for rounding
+ bcs 4f         @ tripped carry? then have leading 1 in C as required
+12:
+ adds r0,#0x80  @ rounding
+ bcs 4f         @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
+ cmp r5,#0
+ beq 7f         @ sticky bits zero?
+8:
+ lsls r0,#1     @ remove leading 1
+9:
+ subs r2,#1     @ compensate exponent on this path
+4:
+ cmp r2,#254
+ bge 5f         @ overflow?
+ adds r2,#1     @ correct exponent offset
+ ble 10f        @ denormal/underflow?
+ lsrs r0,#9     @ align mantissa
+ lsls r2,#23    @ align exponent
+ orrs r0,r2     @ assemble exponent and mantissa
+6:
+ orrs r0,r4     @ apply sign
+1:
+ bx r14
+
+5:
+ movs r0,#0xff  @ create infinity
+ lsls r0,#23
+ b 6b
+
+10:
+ movs r0,#0     @ create zero
+ bx r14
+
+7:              @ sticky bit rounding case
+ lsls r5,r0,#24 @ check bottom 8 bits of r0
+ bne 8b         @ in rounding-tie case?
+ lsrs r0,#9     @ ensure even result
+ lsls r0,#10
+ b 9b
+)");
+}
+
+
diff --git a/sos-iir-filter.h b/sos-iir-filter.h
index c827ef9..f4280e8 100644
--- a/sos-iir-filter.h
+++ b/sos-iir-filter.h
@@ -28,17 +28,7 @@
 
 extern "C" {
 #include <qfplib-m0-full.h>
-}
-
-float qfp_fpow(float b, float e)
-{
-    return qfp_fexp(qfp_fmul(e, qfp_fln(b)));
-}
-
-float qfp_flog10(float x)
-{
-    static const auto ln10 = qfp_fln(10.f);
-    return qfp_fdiv(qfp_fln(x), ln10);
+#include "qfplib-port.h"
 }
 
 class sos_t
@@ -49,7 +39,7 @@ public:
     constexpr sos_t(float v_ = 0.f): v(v_) {}
 
     sos_t operator+(auto x) const noexcept {
-        return qfp_fadd(v, x);
+        return qfp_fadd_asm(v, x);
     }
 
     sos_t operator-(const sos_t& o) const noexcept {
@@ -57,7 +47,7 @@ public:
     }
 
     sos_t operator*(auto x) const noexcept {
-        return qfp_fmul(v, x);
+        return qfp_fmul_asm(v, x);
     }
 
     sos_t operator/(auto x) const noexcept {