From d2f50fb9259a3142fa6b319a7a735a0e19355d01 Mon Sep 17 00:00:00 2001 From: Clyne Sullivan Date: Sat, 8 May 2021 11:55:12 -0400 Subject: [PATCH] include dsp lib; split timers; user message --- gui/cmsis/arm_conv_f32.c | 647 +++++++++++++++++++++++ gui/cmsis/arm_conv_q15.c | 734 ++++++++++++++++++++++++++ gui/cmsis/arm_conv_q31.c | 565 ++++++++++++++++++++ gui/cmsis/arm_conv_q7.c | 690 ++++++++++++++++++++++++ gui/cmsis/arm_fir_f32.c | 997 +++++++++++++++++++++++++++++++++++ gui/cmsis/arm_fir_init_f32.c | 96 ++++ gui/cmsis/arm_fir_init_q15.c | 154 ++++++ gui/cmsis/arm_fir_init_q31.c | 96 ++++ gui/cmsis/arm_fir_init_q7.c | 94 ++++ gui/cmsis/arm_fir_q15.c | 691 ++++++++++++++++++++++++ gui/cmsis/arm_fir_q31.c | 365 +++++++++++++ gui/cmsis/arm_fir_q7.c | 397 ++++++++++++++ gui/wxmain.cpp | 140 ++--- gui/wxmain.hpp | 8 +- gui/wxmain_devdata.cpp | 5 +- gui/wxmain_mrun.cpp | 16 +- source/main.cpp | 19 + 17 files changed, 5640 insertions(+), 74 deletions(-) create mode 100644 gui/cmsis/arm_conv_f32.c create mode 100644 gui/cmsis/arm_conv_q15.c create mode 100644 gui/cmsis/arm_conv_q31.c create mode 100644 gui/cmsis/arm_conv_q7.c create mode 100644 gui/cmsis/arm_fir_f32.c create mode 100644 gui/cmsis/arm_fir_init_f32.c create mode 100644 gui/cmsis/arm_fir_init_q15.c create mode 100644 gui/cmsis/arm_fir_init_q31.c create mode 100644 gui/cmsis/arm_fir_init_q7.c create mode 100644 gui/cmsis/arm_fir_q15.c create mode 100644 gui/cmsis/arm_fir_q31.c create mode 100644 gui/cmsis/arm_fir_q7.c diff --git a/gui/cmsis/arm_conv_f32.c b/gui/cmsis/arm_conv_f32.c new file mode 100644 index 0000000..65f7ab8 --- /dev/null +++ b/gui/cmsis/arm_conv_f32.c @@ -0,0 +1,647 @@ +/* ---------------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_conv_f32.c +* +* Description: Convolution of floating-point sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @defgroup Conv Convolution + * + * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector. + * Convolution is similar to correlation and is frequently used in filtering and data analysis. + * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types. + * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3. + * + * \par Algorithm + * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively. + * Then the convolution + * + *
    
+ *                   c[n] = a[n] * b[n]    
+ * 
+ * + * \par + * is defined as + * \image html ConvolutionEquation.gif + * \par + * Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2. + * pSrcA points to the first input vector of length srcALen and + * pSrcB points to the second input vector of length srcBLen. + * The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result. + * + * \par + * Conceptually, when two signals a[n] and b[n] are convolved, + * the signal b[n] slides over a[n]. + * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together. + * + * \par + * Note that convolution is a commutative operation: + * + *
    
+ *                   a[n] * b[n] = b[n] * a[n].    
+ * 
+ * + * \par + * This means that switching the A and B arguments to the convolution functions has no effect. + * + * Fixed-Point Behavior + * + * \par + * Convolution requires summing up a large number of intermediate products. + * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation. + * Refer to the function specific documentation below for further details of the particular algorithm used. + * + * + * Fast Versions + * + * \par + * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires + * the input signals should be scaled down to avoid intermediate overflows. + * + * + * Opt Versions + * + * \par + * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation. + * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of floating-point sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + */ + +void arm_conv_f32( + float32_t * pSrcA, + uint32_t srcALen, + float32_t * pSrcB, + uint32_t srcBLen, + float32_t * pDst) +{ + + +#ifndef ARM_MATH_CM0_FAMILY + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float32_t *pIn1; /* inputA pointer */ + float32_t *pIn2; /* inputB pointer */ + float32_t *pOut = pDst; /* output pointer */ + float32_t *px; /* Intermediate inputA pointer */ + float32_t *py; /* Intermediate inputB pointer */ + float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ + float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* x[1] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* x[2] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* x[3] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + + /* Read x[3] sample */ + x3 = *(px); + + /* Perform the multiply-accumulate */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += x0 * c0; + + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += x1 * c0; + + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += x2 * c0; + + /* acc3 += x[3] * y[srcBLen - 1] */ + acc3 += x3 * c0; + + /* Read y[srcBLen - 2] sample */ + c0 = *(py--); + + /* Read x[4] sample */ + x0 = *(px + 1u); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += x1 * c0; + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += x2 * c0; + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += x3 * c0; + /* acc3 += x[4] * y[srcBLen - 2] */ + acc3 += x0 * c0; + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + + /* Read x[5] sample */ + x1 = *(px + 2u); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += x2 * c0; + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += x3 * c0; + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += x0 * c0; + /* acc3 += x[5] * y[srcBLen - 2] */ + acc3 += x1 * c0; + + /* Read y[srcBLen - 4] sample */ + c0 = *(py--); + + /* Read x[6] sample */ + x2 = *(px + 3u); + px += 4u; + + /* Perform the multiply-accumulates */ + /* acc0 += x[3] * y[srcBLen - 4] */ + acc0 += x3 * c0; + /* acc1 += x[4] * y[srcBLen - 4] */ + acc1 += x0 * c0; + /* acc2 += x[5] * y[srcBLen - 4] */ + acc2 += x1 * c0; + /* acc3 += x[6] * y[srcBLen - 4] */ + acc3 += x2 * c0; + + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += x0 * c0; + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += x1 * c0; + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += x2 * c0; + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += x3 * c0; + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = acc0; + *pOut++ = acc1; + *pOut++ = acc2; + *pOut++ = acc3; + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + + /* Decrement the loop counter */ + blkCnt--; + } + + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0.0f; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum += *px++ * *py--; + + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum += *px++ * *py--; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = sum; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + float32_t *pIn1 = pSrcA; /* inputA pointer */ + float32_t *pIn2 = pSrcB; /* inputB pointer */ + float32_t sum; /* Accumulator */ + uint32_t i, j; /* loop counters */ + + /* Loop to calculate convolution for output length number of times */ + for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++) + { + /* Initialize sum with zero to carry out MAC operations */ + sum = 0.0f; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0u; j <= i; j++) + { + /* Check the array limitations */ + if((((i - j) < srcBLen) && (j < srcALen))) + { + /* z[i] += x[i-j] * y[j] */ + sum += pIn1[j] * pIn2[i - j]; + } + } + /* Store the output in the destination buffer */ + pDst[i] = sum; + } + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + +} + +/** + * @} end of Conv group + */ diff --git a/gui/cmsis/arm_conv_q15.c b/gui/cmsis/arm_conv_q15.c new file mode 100644 index 0000000..8454a94 --- /dev/null +++ b/gui/cmsis/arm_conv_q15.c @@ -0,0 +1,734 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_conv_q15.c +* +* Description: Convolution of Q15 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q15 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both inputs are in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * This approach provides 33 guard bits and there is no risk of overflow. + * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. + * + * \par + * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. + * + * \par + * Refer the function arm_conv_opt_q15() for a faster implementation of this function using scratch buffers. + * + */ + +void arm_conv_q15( + q15_t * pSrcA, + uint32_t srcALen, + q15_t * pSrcB, + uint32_t srcBLen, + q15_t * pDst) +{ + +#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q15_t *pIn1; /* inputA pointer */ + q15_t *pIn2; /* inputB pointer */ + q15_t *pOut = pDst; /* output pointer */ + q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q15_t *px; /* Intermediate inputA pointer */ + q15_t *py; /* Intermediate inputB pointer */ + q15_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations less than 4 */ + /* Second part of this stage computes the MAC operations greater than or equal to 4 */ + + /* The first part of the stage starts here */ + while((count < 4u) && (blockSize1 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Loop over number of MAC operations between + * inputA samples and inputB samples */ + k = count; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* The second part of the stage starts here */ + /* The internal loop, over count, is unrolled by 4 */ + /* To, read the last two inputB samples using SIMD: + * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ + py = py - 1; + + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + (count - 1u); + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is the index by which the pointer pIn1 to be incremented */ + count = 0u; + + + /* -------------------- + * Stage2 process + * -------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + py = py - 1u; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + + /* read x[0], x[1] samples */ + x0 = *__SIMD32(px); + /* read x[1], x[2] samples */ + x1 = _SIMD32_OFFSET(px+1); + px+= 2u; + + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read the last two inputB samples using SIMD: + * y[srcBLen - 1] and y[srcBLen - 2] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLALDX(x0, c0, acc0); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* Read x[2], x[3] */ + x2 = *__SIMD32(px); + + /* Read x[3], x[4] */ + x3 = _SIMD32_OFFSET(px+1); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLALDX(x2, c0, acc2); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLALDX(x3, c0, acc3); + + /* Read y[srcBLen - 3] and y[srcBLen - 4] */ + c0 = *__SIMD32(py)--; + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLALDX(x2, c0, acc0); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLALDX(x3, c0, acc1); + + /* Read x[4], x[5] */ + x0 = _SIMD32_OFFSET(px+2); + + /* Read x[5], x[6] */ + x1 = _SIMD32_OFFSET(px+3); + px += 4u; + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLALDX(x0, c0, acc2); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLALDX(x1, c0, acc3); + + } while(--k); + + /* For the next MAC operations, SIMD is not used + * So, the 16 bit pointer if inputB, py is updated */ + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + if(k == 1u) + { + /* Read y[srcBLen - 5] */ + c0 = *(py+1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; + +#else + + c0 = c0 & 0x0000FFFF; + +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[7] */ + x3 = *__SIMD32(px); + px++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALDX(x1, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + if(k == 2u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + px += 2u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + } + + if(k == 3u) + { + /* Read y[srcBLen - 5], y[srcBLen - 6] */ + c0 = _SIMD32_OFFSET(py); + + /* Read x[7], x[8] */ + x3 = *__SIMD32(px); + + /* Read x[9] */ + x2 = _SIMD32_OFFSET(px+1); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x0, c0, acc0); + acc1 = __SMLALDX(x1, c0, acc1); + acc2 = __SMLALDX(x3, c0, acc2); + acc3 = __SMLALDX(x2, c0, acc3); + + c0 = *(py-1); + +#ifdef ARM_MATH_BIG_ENDIAN + + c0 = c0 << 16u; +#else + + c0 = c0 & 0x0000FFFF; +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + /* Read x[10] */ + x3 = _SIMD32_OFFSET(px+2); + px += 3u; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALDX(x1, c0, acc0); + acc1 = __SMLALD(x2, c0, acc1); + acc2 = __SMLALDX(x2, c0, acc2); + acc3 = __SMLALDX(x3, c0, acc3); + } + + + /* Store the results in the accumulators in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pOut)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) ((q31_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT(sum >> 15, 16)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + blockSize3 = srcBLen - 1u; + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + pIn2 = pSrc2 - 1u; + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + /* For loop unrolling by 4, this stage is divided into two. */ + /* First part of this stage computes the MAC operations greater than 4 */ + /* Second part of this stage computes the MAC operations less than or equal to 4 */ + + /* The first part of the stage starts here */ + j = blockSize3 >> 2u; + + while((j > 0u) && (blockSize3 > 0u)) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied + * with y[srcBLen - 1], y[srcBLen - 2] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied + * with y[srcBLen - 3], y[srcBLen - 4] respectively */ + sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* For the next MAC operations, the pointer py is used without SIMD + * So, py is incremented by 1 */ + py = py + 1u; + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement the loop counter */ + blockSize3--; + + j--; + } + + /* The second part of the stage starts here */ + /* SIMD is not used for the next MAC operations, + * so pointer py is updated to read only one sample at a time */ + py = py + 1u; + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + /* sum += x[srcALen-1] * y[srcBLen-1] */ + sum = __SMLALD(*px++, *py--, sum); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q15_t) (__SSAT((sum >> 15), 16)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q15_t *pIn1 = pSrcA; /* input pointer */ + q15_t *pIn2 = pSrcB; /* coefficient pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += (q31_t) pIn1[j] * (pIn2[i - j]); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); + } + +#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/ + +} + +/** + * @} end of Conv group + */ diff --git a/gui/cmsis/arm_conv_q31.c b/gui/cmsis/arm_conv_q31.c new file mode 100644 index 0000000..ffa972f --- /dev/null +++ b/gui/cmsis/arm_conv_q31.c @@ -0,0 +1,565 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_conv_q31.c +* +* Description: Convolution of Q31 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q31 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * There is no saturation on intermediate additions. + * Thus, if the accumulator overflows it wraps around and distorts the result. + * The input signals should be scaled down to avoid intermediate overflows. + * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows, + * as maximum of min(srcALen, srcBLen) number of additions are carried internally. + * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. + * + * \par + * See arm_conv_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4. + */ + +void arm_conv_q31( + q31_t * pSrcA, + uint32_t srcALen, + q31_t * pSrcB, + uint32_t srcBLen, + q31_t * pDst) +{ + + +#ifndef ARM_MATH_CM0_FAMILY + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t *pIn1; /* inputA pointer */ + q31_t *pIn2; /* inputB pointer */ + q31_t *pOut = pDst; /* output pointer */ + q31_t *px; /* Intermediate inputA pointer */ + q31_t *py; /* Intermediate inputB pointer */ + q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q63_t sum; /* Accumulator */ + q63_t acc0, acc1, acc2; /* Accumulator */ + q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = (q31_t *) pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = (q31_t *) pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = srcALen - (srcBLen - 1u); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py--); + /* x[1] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py--); + /* x[2] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py--); + /* x[3] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll by 3 */ + blkCnt = blockSize2 / 3; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + + /* Apply loop unrolling and compute 3 MACs simultaneously. */ + k = srcBLen / 3; + + /* First part of the processing with loop unrolling. Compute 3 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 2 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py); + + /* Read x[3] sample */ + x2 = *(px); + + /* Perform the multiply-accumulates */ + /* acc0 += x[0] * y[srcBLen - 1] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[1] * y[srcBLen - 1] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[2] * y[srcBLen - 1] */ + acc2 += ((q63_t) x2 * c0); + + /* Read y[srcBLen - 2] sample */ + c0 = *(py - 1u); + + /* Read x[4] sample */ + x0 = *(px + 1u); + + /* Perform the multiply-accumulate */ + /* acc0 += x[1] * y[srcBLen - 2] */ + acc0 += ((q63_t) x1 * c0); + /* acc1 += x[2] * y[srcBLen - 2] */ + acc1 += ((q63_t) x2 * c0); + /* acc2 += x[3] * y[srcBLen - 2] */ + acc2 += ((q63_t) x0 * c0); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py - 2u); + + /* Read x[5] sample */ + x1 = *(px + 2u); + + /* Perform the multiply-accumulates */ + /* acc0 += x[2] * y[srcBLen - 3] */ + acc0 += ((q63_t) x2 * c0); + /* acc1 += x[3] * y[srcBLen - 2] */ + acc1 += ((q63_t) x0 * c0); + /* acc2 += x[4] * y[srcBLen - 2] */ + acc2 += ((q63_t) x1 * c0); + + /* update scratch pointers */ + px += 3u; + py -= 3u; + + } while(--k); + + /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen - (3 * (srcBLen / 3)); + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += ((q63_t) x0 * c0); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += ((q63_t) x1 * c0); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += ((q63_t) x2 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + k--; + } + + /* Store the results in the accumulators in the destination buffer. */ + *pOut++ = (q31_t) (acc0 >> 31); + *pOut++ = (q31_t) (acc1 >> 31); + *pOut++ = (q31_t) (acc2 >> 31); + + /* Increment the pointer pIn1 index, count by 3 */ + count += 3u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 - 3 * (blockSize2 / 3); + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + sum += (q63_t) * px++ * (*py--); + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += (q63_t) * px++ * (*py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q31_t) (sum >> 31); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q31_t *pIn1 = pSrcA; /* input pointer */ + q31_t *pIn2 = pSrcB; /* coefficient pointer */ + q63_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += ((q63_t) pIn1[j] * (pIn2[i - j])); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q31_t) (sum >> 31u); + } + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + +} + +/** + * @} end of Conv group + */ diff --git a/gui/cmsis/arm_conv_q7.c b/gui/cmsis/arm_conv_q7.c new file mode 100644 index 0000000..79b08fc --- /dev/null +++ b/gui/cmsis/arm_conv_q7.c @@ -0,0 +1,690 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_conv_q7.c +* +* Description: Convolution of Q7 sequences. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup Conv + * @{ + */ + +/** + * @brief Convolution of Q7 sequences. + * @param[in] *pSrcA points to the first input sequence. + * @param[in] srcALen length of the first input sequence. + * @param[in] *pSrcB points to the second input sequence. + * @param[in] srcBLen length of the second input sequence. + * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072. + * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format. + * + * \par + * Refer the function arm_conv_opt_q7() for a faster implementation of this function. + * + */ + +void arm_conv_q7( + q7_t * pSrcA, + uint32_t srcALen, + q7_t * pSrcB, + uint32_t srcBLen, + q7_t * pDst) +{ + + +#ifndef ARM_MATH_CM0_FAMILY + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pIn1; /* inputA pointer */ + q7_t *pIn2; /* inputB pointer */ + q7_t *pOut = pDst; /* output pointer */ + q7_t *px; /* Intermediate inputA pointer */ + q7_t *py; /* Intermediate inputB pointer */ + q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ + q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */ + q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ + q31_t input1, input2; /* Temporary input variables */ + q15_t in1, in2; /* Temporary input variables */ + uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + if(srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + } + + /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1u; + blockSize2 = (srcALen - srcBLen) + 1u; + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[0] + * sum = x[0] * y[1] + x[1] * y[0] + * .... + * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1u; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while(blockSize1 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = count >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* x[0] , x[1] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* y[srcBLen - 1] , y[srcBLen - 2] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* x[0] * y[srcBLen - 1] */ + /* x[1] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* x[2] , x[3] */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* y[srcBLen - 3] , y[srcBLen - 4] */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* x[2] * y[srcBLen - 3] */ + /* x[3] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the count is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = count % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pIn2 + count; + px = pIn1; + + /* Increment the MAC count */ + count++; + + /* Decrement the loop counter */ + blockSize1--; + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] + * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] + * .... + * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0u; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if(srcBLen >= 4u) + { + /* Loop unroll over blockSize2, by 4 */ + blkCnt = blockSize2 >> 2u; + + while(blkCnt > 0u) + { + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* read x[0], x[1], x[2] samples */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + do + { + /* Read y[srcBLen - 1] sample */ + c0 = *(py--); + /* Read y[srcBLen - 2] sample */ + c1 = *(py--); + + /* Read x[3] sample */ + x3 = *(px++); + + /* x[0] and x[1] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[1] and x[2] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[4] sample */ + x0 = *(px++); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ + acc3 = __SMLAD(input1, input2, acc3); + + /* Read y[srcBLen - 3] sample */ + c0 = *(py--); + /* Read y[srcBLen - 4] sample */ + c1 = *(py--); + + /* Read x[5] sample */ + x1 = *(px++); + + /* x[2] and x[3] are packed */ + in1 = (q15_t) x2; + in2 = (q15_t) x3; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ + in1 = (q15_t) c0; + in2 = (q15_t) c1; + + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ + acc0 = __SMLAD(input1, input2, acc0); + + /* x[3] and x[4] are packed */ + in1 = (q15_t) x3; + in2 = (q15_t) x0; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ + acc1 = __SMLAD(input1, input2, acc1); + + /* x[4] and x[5] are packed */ + in1 = (q15_t) x0; + in2 = (q15_t) x1; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ + acc2 = __SMLAD(input1, input2, acc2); + + /* Read x[6] sample */ + x2 = *(px++); + + /* x[5] and x[6] are packed */ + in1 = (q15_t) x1; + in2 = (q15_t) x2; + + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ + acc3 = __SMLAD(input1, input2, acc3); + + } while(--k); + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Read y[srcBLen - 5] sample */ + c0 = *(py--); + + /* Read x[7] sample */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + /* acc0 += x[4] * y[srcBLen - 5] */ + acc0 += ((q15_t) x0 * c0); + /* acc1 += x[5] * y[srcBLen - 5] */ + acc1 += ((q15_t) x1 * c0); + /* acc2 += x[6] * y[srcBLen - 5] */ + acc2 += ((q15_t) x2 * c0); + /* acc3 += x[7] * y[srcBLen - 5] */ + acc3 += ((q15_t) x3 * c0); + + /* Reuse the present samples for the next MAC */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + k--; + } + + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8)); + *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8)); + *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8)); + *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8)); + + /* Increment the pointer pIn1 index, count by 4 */ + count += 4u; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize2 % 0x4u; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = srcBLen >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Reading two inputs of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Perform the multiply-accumulates */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = srcBLen % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while(blkCnt > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* srcBLen number of MACS should be performed */ + k = srcBLen; + + while(k > 0u) + { + /* Perform the multiply-accumulate */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); + + /* Increment the MAC count */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pSrc2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] + * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] + * .... + * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] + * sum += x[srcALen-1] * y[srcBLen-1] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The blockSize3 variable holds the number of MAC operations performed */ + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); + px = pSrc1; + + /* Working pointer of inputB */ + pSrc2 = pIn2 + (srcBLen - 1u); + py = pSrc2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while(blockSize3 > 0u) + { + /* Accumulator is made zero for every iteration */ + sum = 0; + + /* Apply loop unrolling and compute 4 MACs simultaneously. */ + k = blockSize3 >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + ** a second loop below computes MACs for the remaining 1 to 3 samples. */ + while(k > 0u) + { + /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ + /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ + sum = __SMLAD(input1, input2, sum); + + /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ + in1 = (q15_t) * px++; + in2 = (q15_t) * px++; + input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ + in1 = (q15_t) * py--; + in2 = (q15_t) * py--; + input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u); + + /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ + /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ + sum = __SMLAD(input1, input2, sum); + + /* Decrement the loop counter */ + k--; + } + + /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + k = blockSize3 % 0x4u; + + while(k > 0u) + { + /* Perform the multiply-accumulates */ + sum += ((q15_t) * px++ * *py--); + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8)); + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pSrc2; + + /* Decrement the loop counter */ + blockSize3--; + } + +#else + + /* Run the below code for Cortex-M0 */ + + q7_t *pIn1 = pSrcA; /* input pointer */ + q7_t *pIn2 = pSrcB; /* coefficient pointer */ + q31_t sum; /* Accumulator */ + uint32_t i, j; /* loop counter */ + + /* Loop to calculate output of convolution for output length number of times */ + for (i = 0; i < (srcALen + srcBLen - 1); i++) + { + /* Initialize sum with zero to carry on MAC operations */ + sum = 0; + + /* Loop to perform MAC operations according to convolution equation */ + for (j = 0; j <= i; j++) + { + /* Check the array limitations */ + if(((i - j) < srcBLen) && (j < srcALen)) + { + /* z[i] += x[i-j] * y[j] */ + sum += (q15_t) pIn1[j] * (pIn2[i - j]); + } + } + + /* Store the output in the destination buffer */ + pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); + } + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + +} + +/** + * @} end of Conv group + */ diff --git a/gui/cmsis/arm_fir_f32.c b/gui/cmsis/arm_fir_f32.c new file mode 100644 index 0000000..3ecb7b5 --- /dev/null +++ b/gui/cmsis/arm_fir_f32.c @@ -0,0 +1,997 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_fir_f32.c +* +* Description: Floating-point FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** +* @ingroup groupFilters +*/ + +/** +* @defgroup FIR Finite Impulse Response (FIR) Filters +* +* This set of functions implements Finite Impulse Response (FIR) filters +* for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided. +* The functions operate on blocks of input and output data and each call to the function processes +* blockSize samples through the filter. pSrc and +* pDst points to input and output arrays containing blockSize values. +* +* \par Algorithm: +* The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations. +* Each filter coefficient b[n] is multiplied by a state variable which equals a previous input sample x[n]. +*
  
+*    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]  
+* 
+* \par +* \image html FIR.gif "Finite Impulse Response filter" +* \par +* pCoeffs points to a coefficient array of size numTaps. +* Coefficients are stored in time reversed order. +* \par +*
  
+*    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}  
+* 
+* \par +* pState points to a state array of size numTaps + blockSize - 1. +* Samples in the state buffer are stored in the following order. +* \par +*
  
+*    {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}  
+* 
+* \par +* Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1. +* The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters, +* to be avoided and yields a significant speed improvement. +* The state variables are updated after each block of data is processed; the coefficients are untouched. +* \par Instance Structure +* The coefficients and state variables for a filter are stored together in an instance data structure. +* A separate instance structure must be defined for each filter. +* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. +* There are separate instance structure declarations for each of the 4 supported data types. +* +* \par Initialization Functions +* There is also an associated initialization function for each data type. +* The initialization function performs the following operations: +* - Sets the values of the internal structure fields. +* - Zeros out the values in the state buffer. +* To do this manually without calling the init function, assign the follow subfields of the instance structure: +* numTaps, pCoeffs, pState. Also set all of the values in pState to zero. +* +* \par +* Use of the initialization function is optional. +* However, if the initialization function is used, then the instance structure cannot be placed into a const data section. +* To place an instance structure into a const data section, the instance structure must be manually initialized. +* Set the values in the state buffer to zeros before static initialization. +* The code below statically initializes each of the 4 different data type filter instance structures +*
  
+*arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};  
+*arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};  
+*arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};  
+*arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};  
+* 
+* +* where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer; +* pCoeffs is the address of the coefficient buffer. +* +* \par Fixed-Point Behavior +* Care must be taken when using the fixed-point versions of the FIR filter functions. +* In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. +* Refer to the function specific documentation below for usage guidelines. +*/ + +/** +* @addtogroup FIR +* @{ +*/ + +/** +* +* @param[in] *S points to an instance of the floating-point FIR filter structure. +* @param[in] *pSrc points to the block of input data. +* @param[out] *pDst points to the block of output data. +* @param[in] blockSize number of samples to process per call. +* @return none. +* +*/ + +#if defined(ARM_MATH_CM7) + +void arm_fir_f32( +const arm_fir_instance_f32 * S, +float32_t * pSrc, +float32_t * pDst, +uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */ + float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 8 output values simultaneously. + * The variables acc0 ... acc7 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 3; + + /* First part of the processing with loop unrolling. Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + acc4 = 0.0f; + acc5 = 0.0f; + acc6 = 0.0f; + acc7 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* This is separated from the others to avoid + * a call to __aeabi_memmove which would be slower + */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Read the first seven samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *px++; + x1 = *px++; + x2 = *px++; + x3 = *px++; + x4 = *px++; + x5 = *px++; + x6 = *px++; + + /* Loop unrolling. Process 8 taps at a time. */ + tapCnt = numTaps >> 3u; + + /* Loop over the number of taps. Unroll by a factor of 8. + ** Repeat until we've computed numTaps-8 coefficients. */ + while(tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x7 = *(px++); + + /* acc0 += b[numTaps-1] * x[n-numTaps] */ + acc0 += x0 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-1] */ + acc1 += x1 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-2] */ + acc2 += x2 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-3] */ + acc3 += x3 * c0; + + /* acc4 += b[numTaps-1] * x[n-numTaps-4] */ + acc4 += x4 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-5] */ + acc5 += x5 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-6] */ + acc6 += x6 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-7] */ + acc7 += x7 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + /* Perform the multiply-accumulate */ + acc0 += x1 * c0; + acc1 += x2 * c0; + acc2 += x3 * c0; + acc3 += x4 * c0; + acc4 += x5 * c0; + acc5 += x6 * c0; + acc6 += x7 * c0; + acc7 += x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x2 * c0; + acc1 += x3 * c0; + acc2 += x4 * c0; + acc3 += x5 * c0; + acc4 += x6 * c0; + acc5 += x7 * c0; + acc6 += x0 * c0; + acc7 += x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x3 * c0; + acc1 += x4 * c0; + acc2 += x5 * c0; + acc3 += x6 * c0; + acc4 += x7 * c0; + acc5 += x0 * c0; + acc6 += x1 * c0; + acc7 += x2 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x3 = *(px++); + /* Perform the multiply-accumulates */ + acc0 += x4 * c0; + acc1 += x5 * c0; + acc2 += x6 * c0; + acc3 += x7 * c0; + acc4 += x0 * c0; + acc5 += x1 * c0; + acc6 += x2 * c0; + acc7 += x3 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x4 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x5 * c0; + acc1 += x6 * c0; + acc2 += x7 * c0; + acc3 += x0 * c0; + acc4 += x1 * c0; + acc5 += x2 * c0; + acc6 += x3 * c0; + acc7 += x4 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x5 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x6 * c0; + acc1 += x7 * c0; + acc2 += x0 * c0; + acc3 += x1 * c0; + acc4 += x2 * c0; + acc5 += x3 * c0; + acc6 += x4 * c0; + acc7 += x5 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x6 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x7 * c0; + acc1 += x0 * c0; + acc2 += x1 * c0; + acc3 += x2 * c0; + acc4 += x3 * c0; + acc5 += x4 * c0; + acc6 += x5 * c0; + acc7 += x6 * c0; + + tapCnt--; + } + + /* If the filter length is not a multiple of 8, compute the remaining filter taps */ + tapCnt = numTaps % 0x8u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x7 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += x0 * c0; + acc1 += x1 * c0; + acc2 += x2 * c0; + acc3 += x3 * c0; + acc4 += x4 * c0; + acc5 += x5 * c0; + acc6 += x6 * c0; + acc7 += x7 * c0; + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by 8 to process the next group of 8 samples */ + pState = pState + 8; + + /* The results in the 8 accumulators, store in the destination buffer. */ + *pDst++ = acc0; + *pDst++ = acc1; + *pDst++ = acc2; + *pDst++ = acc3; + *pDst++ = acc4; + *pDst++ = acc5; + *pDst++ = acc6; + *pDst++ = acc7; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 8, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x8u; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += *px++ * *pb++; + i--; + + } while(i > 0u); + + /* The result is store in the destination buffer. */ + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#elif defined(ARM_MATH_CM0_FAMILY) + +void arm_fir_f32( +const arm_fir_instance_f32 * S, +float32_t * pSrc, +float32_t * pDst, +uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* Run the below code for Cortex-M0 */ + + float32_t acc; + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += *px++ * *pb++; + i--; + + } while(i > 0u); + + /* The result is store in the destination buffer. */ + *pDst++ = acc; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the starting of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = numTaps - 1u; + + /* Copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#else + +/* Run the below code for Cortex-M4 and Cortex-M3 */ + +void arm_fir_f32( +const arm_fir_instance_f32 * S, +float32_t * pSrc, +float32_t * pDst, +uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */ + float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + float32_t p0,p1,p2,p3,p4,p5,p6,p7; /* Temporary product values */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 8 output values simultaneously. + * The variables acc0 ... acc7 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 3; + + /* First part of the processing with loop unrolling. Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0.0f; + acc1 = 0.0f; + acc2 = 0.0f; + acc3 = 0.0f; + acc4 = 0.0f; + acc5 = 0.0f; + acc6 = 0.0f; + acc7 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coeff pointer */ + pb = (pCoeffs); + + /* This is separated from the others to avoid + * a call to __aeabi_memmove which would be slower + */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Read the first seven samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *px++; + x1 = *px++; + x2 = *px++; + x3 = *px++; + x4 = *px++; + x5 = *px++; + x6 = *px++; + + /* Loop unrolling. Process 8 taps at a time. */ + tapCnt = numTaps >> 3u; + + /* Loop over the number of taps. Unroll by a factor of 8. + ** Repeat until we've computed numTaps-8 coefficients. */ + while(tapCnt > 0u) + { + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-3] sample */ + x7 = *(px++); + + /* acc0 += b[numTaps-1] * x[n-numTaps] */ + p0 = x0 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-1] */ + p1 = x1 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-2] */ + p2 = x2 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-3] */ + p3 = x3 * c0; + + /* acc4 += b[numTaps-1] * x[n-numTaps-4] */ + p4 = x4 * c0; + + /* acc1 += b[numTaps-1] * x[n-numTaps-5] */ + p5 = x5 * c0; + + /* acc2 += b[numTaps-1] * x[n-numTaps-6] */ + p6 = x6 * c0; + + /* acc3 += b[numTaps-1] * x[n-numTaps-7] */ + p7 = x7 * c0; + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + + /* Perform the multiply-accumulate */ + p0 = x1 * c0; + p1 = x2 * c0; + p2 = x3 * c0; + p3 = x4 * c0; + p4 = x5 * c0; + p5 = x6 * c0; + p6 = x7 * c0; + p7 = x0 * c0; + + /* Read the b[numTaps-3] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x2 * c0; + p1 = x3 * c0; + p2 = x4 * c0; + p3 = x5 * c0; + p4 = x6 * c0; + p5 = x7 * c0; + p6 = x0 * c0; + p7 = x1 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x3 * c0; + p1 = x4 * c0; + p2 = x5 * c0; + p3 = x6 * c0; + p4 = x7 * c0; + p5 = x0 * c0; + p6 = x1 * c0; + p7 = x2 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x3 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x4 * c0; + p1 = x5 * c0; + p2 = x6 * c0; + p3 = x7 * c0; + p4 = x0 * c0; + p5 = x1 * c0; + p6 = x2 * c0; + p7 = x3 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x4 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x5 * c0; + p1 = x6 * c0; + p2 = x7 * c0; + p3 = x0 * c0; + p4 = x1 * c0; + p5 = x2 * c0; + p6 = x3 * c0; + p7 = x4 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x5 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x6 * c0; + p1 = x7 * c0; + p2 = x0 * c0; + p3 = x1 * c0; + p4 = x2 * c0; + p5 = x3 * c0; + p6 = x4 * c0; + p7 = x5 * c0; + + /* Read the b[numTaps-4] coefficient */ + c0 = *(pb++); + + /* Read x[n-numTaps-6] sample */ + x6 = *(px++); + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Perform the multiply-accumulates */ + p0 = x7 * c0; + p1 = x0 * c0; + p2 = x1 * c0; + p3 = x2 * c0; + p4 = x3 * c0; + p5 = x4 * c0; + p6 = x5 * c0; + p7 = x6 * c0; + + tapCnt--; + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + } + + /* If the filter length is not a multiple of 8, compute the remaining filter taps */ + tapCnt = numTaps % 0x8u; + + while(tapCnt > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x7 = *(px++); + + /* Perform the multiply-accumulates */ + p0 = x0 * c0; + p1 = x1 * c0; + p2 = x2 * c0; + p3 = x3 * c0; + p4 = x4 * c0; + p5 = x5 * c0; + p6 = x6 * c0; + p7 = x7 * c0; + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + x3 = x4; + x4 = x5; + x5 = x6; + x6 = x7; + + acc0 += p0; + acc1 += p1; + acc2 += p2; + acc3 += p3; + acc4 += p4; + acc5 += p5; + acc6 += p6; + acc7 += p7; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Advance the state pointer by 8 to process the next group of 8 samples */ + pState = pState + 8; + + /* The results in the 8 accumulators, store in the destination buffer. */ + *pDst++ = acc0; + *pDst++ = acc1; + *pDst++ = acc2; + *pDst++ = acc3; + *pDst++ = acc4; + *pDst++ = acc5; + *pDst++ = acc6; + *pDst++ = acc7; + + blkCnt--; + } + + /* If the blockSize is not a multiple of 8, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x8u; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.0f; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += *px++ * *pb++; + i--; + + } while(i > 0u); + + /* The result is store in the destination buffer. */ + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the start of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#endif + +/** +* @} end of FIR group +*/ diff --git a/gui/cmsis/arm_fir_init_f32.c b/gui/cmsis/arm_fir_init_f32.c new file mode 100644 index 0000000..92bdc9e --- /dev/null +++ b/gui/cmsis/arm_fir_init_f32.c @@ -0,0 +1,96 @@ +/*----------------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_fir_init_f32.c +* +* Description: Floating-point FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* ---------------------------------------------------------------------------*/ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the floating-point FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f32(). + */ + +void arm_fir_init_f32( + arm_fir_instance_f32 * S, + uint16_t numTaps, + float32_t * pCoeffs, + float32_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and the size of state buffer is (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(float32_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/gui/cmsis/arm_fir_init_q15.c b/gui/cmsis/arm_fir_init_q15.c new file mode 100644 index 0000000..d976d73 --- /dev/null +++ b/gui/cmsis/arm_fir_init_q15.c @@ -0,0 +1,154 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_fir_init_q15.c +* +* Description: Q15 FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* ------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in,out] *S points to an instance of the Q15 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. Must be even and greater than or equal to 4. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize is number of samples processed per call. + * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if + * numTaps is not greater than or equal to 4 and even. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * Note that numTaps must be even and greater than or equal to 4. + * To implement an odd length filter simply increase numTaps by 1 and set the last coefficient to zero. + * For example, to implement a filter with numTaps=3 and coefficients + *
    
+ *     {0.3, -0.8, 0.3}    
+ * 
+ * set numTaps=4 and use the coefficients: + *
    
+ *     {0.3, -0.8, 0.3, 0}.    
+ * 
+ * Similarly, to implement a two point filter + *
    
+ *     {0.3, -0.3}    
+ * 
+ * set numTaps=4 and use the coefficients: + *
    
+ *     {0.3, -0.3, 0, 0}.    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize, when running on Cortex-M4 and Cortex-M3 and is of length numTaps+blockSize-1, when running on Cortex-M0 where blockSize is the number of input samples processed by each call to arm_fir_q15(). + */ + +arm_status arm_fir_init_q15( + arm_fir_instance_q15 * S, + uint16_t numTaps, + q15_t * pCoeffs, + q15_t * pState, + uint32_t blockSize) +{ + arm_status status; + + +#ifndef ARM_MATH_CM0_FAMILY + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /* The Number of filter coefficients in the filter must be even and at least 4 */ + if(numTaps & 0x1u) + { + status = ARM_MATH_ARGUMENT_ERROR; + } + else + { + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps ) */ + memset(pState, 0, (numTaps + (blockSize)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + } + + return (status); + +#else + + /* Run the below code for Cortex-M0 */ + + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q15_t)); + + /* Assign state pointer */ + S->pState = pState; + + status = ARM_MATH_SUCCESS; + + return (status); + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + +} + +/** + * @} end of FIR group + */ diff --git a/gui/cmsis/arm_fir_init_q31.c b/gui/cmsis/arm_fir_init_q31.c new file mode 100644 index 0000000..726cdfc --- /dev/null +++ b/gui/cmsis/arm_fir_init_q31.c @@ -0,0 +1,96 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_fir_init_q31.c +* +* Description: Q31 FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @details + * + * @param[in,out] *S points to an instance of the Q31 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none. + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q31(). + */ + +void arm_fir_init_q31( + arm_fir_instance_q31 * S, + uint16_t numTaps, + q31_t * pCoeffs, + q31_t * pState, + uint32_t blockSize) +{ + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and state array size is (blockSize + numTaps - 1) */ + memset(pState, 0, (blockSize + ((uint32_t) numTaps - 1u)) * sizeof(q31_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/gui/cmsis/arm_fir_init_q7.c b/gui/cmsis/arm_fir_init_q7.c new file mode 100644 index 0000000..083d58e --- /dev/null +++ b/gui/cmsis/arm_fir_init_q7.c @@ -0,0 +1,94 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_fir_init_q7.c +* +* Description: Q7 FIR filter initialization function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* ------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ +/** + * @param[in,out] *S points to an instance of the Q7 FIR filter structure. + * @param[in] numTaps Number of filter coefficients in the filter. + * @param[in] *pCoeffs points to the filter coefficients buffer. + * @param[in] *pState points to the state buffer. + * @param[in] blockSize number of samples that are processed per call. + * @return none + * + * Description: + * \par + * pCoeffs points to the array of filter coefficients stored in time reversed order: + *
    
+ *    {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}    
+ * 
+ * \par + * pState points to the array of state variables. + * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q7(). + */ + +void arm_fir_init_q7( + arm_fir_instance_q7 * S, + uint16_t numTaps, + q7_t * pCoeffs, + q7_t * pState, + uint32_t blockSize) +{ + + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q7_t)); + + /* Assign state pointer */ + S->pState = pState; + +} + +/** + * @} end of FIR group + */ diff --git a/gui/cmsis/arm_fir_q15.c b/gui/cmsis/arm_fir_q15.c new file mode 100644 index 0000000..f3c595f --- /dev/null +++ b/gui/cmsis/arm_fir_q15.c @@ -0,0 +1,691 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_fir_q15.c +* +* Description: Q15 FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @brief Processing function for the Q15 FIR filter. + * @param[in] *S points to an instance of the Q15 FIR structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * + * \par Restrictions + * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE + * In this case input, output, state buffers should be aligned by 32-bit + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 64-bit internal accumulator. + * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result. + * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits. + * Lastly, the accumulator is saturated to yield a result in 1.15 format. + * + * \par + * Refer to the function arm_fir_fast_q15() for a faster but less precise implementation of this function. + */ + +#ifndef ARM_MATH_CM0_FAMILY + +/* Run the below code for Cortex-M4 and Cortex-M3 */ + +#ifndef UNALIGNED_SUPPORT_DISABLE + + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q15_t *px1; /* Temporary q15 pointer for state buffer */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */ + q63_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer. + ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ + *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Initialize state pointer of type q15 */ + px1 = pState; + + /* Initialize coeff pointer of type q31 */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ + x0 = _SIMD32_OFFSET(px1); + + /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */ + x1 = _SIMD32_OFFSET(px1 + 1u); + + px1 += 2u; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-4 coefficients. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0u) + { + /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + acc0 = __SMLALD(x0, c0, acc0); + + /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ + acc1 = __SMLALD(x1, c0, acc1); + + /* Read state x[n-N-2], x[n-N-3] */ + x2 = _SIMD32_OFFSET(px1); + + /* Read state x[n-N-3], x[n-N-4] */ + x3 = _SIMD32_OFFSET(px1 + 1u); + + /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ + acc2 = __SMLALD(x2, c0, acc2); + + /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ + acc3 = __SMLALD(x3, c0, acc3); + + /* Read coefficients b[N-2], b[N-3] */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ + acc0 = __SMLALD(x2, c0, acc0); + + /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ + acc1 = __SMLALD(x3, c0, acc1); + + /* Read state x[n-N-4], x[n-N-5] */ + x0 = _SIMD32_OFFSET(px1 + 2u); + + /* Read state x[n-N-5], x[n-N-6] */ + x1 = _SIMD32_OFFSET(px1 + 3u); + + /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ + acc2 = __SMLALD(x0, c0, acc2); + + /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ + acc3 = __SMLALD(x1, c0, acc3); + + px1 += 4u; + + tapCnt--; + + } + + + /* If the filter length is not a multiple of 4, compute the remaining filter taps. + ** This is always be 2 taps since the filter length is even. */ + if((numTaps & 0x3u) != 0u) + { + /* Read 2 coefficients */ + c0 = *__SIMD32(pb)++; + + /* Fetch 4 state variables */ + x2 = _SIMD32_OFFSET(px1); + + x3 = _SIMD32_OFFSET(px1 + 1u); + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + + px1 += 2u; + + acc1 = __SMLALD(x1, c0, acc1); + acc2 = __SMLALD(x2, c0, acc2); + acc3 = __SMLALD(x3, c0, acc3); + } + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. + ** Then store the 4 outputs in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + while(blkCnt > 0u) + { + /* Copy two samples into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer of type q15 */ + px1 = pState; + + /* Initialize coeff pointer of type q31 */ + pb = pCoeffs; + + tapCnt = numTaps >> 1; + + do + { + + c0 = *__SIMD32(pb)++; + x0 = *__SIMD32(px1)++; + + acc0 = __SMLALD(x0, c0, acc0); + tapCnt--; + } + while(tapCnt > 0u); + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1u) >> 2; + + while(tapCnt > 0u) + { + + /* Copy state values to start of state buffer */ + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* copy remaining data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + +#else /* UNALIGNED_SUPPORT_DISABLE */ + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + q63_t acc0, acc1, acc2, acc3; /* Accumulators */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */ + q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */ + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer. + ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */ + px = pState; + + /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ + x0 = *__SIMD32(px)++; + + /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */ + x2 = *__SIMD32(px)++; + + /* Loop over the number of taps. Unroll by a factor of 4. + ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */ + tapCnt = numTaps >> 2; + + while(tapCnt > 0) + { + /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ + acc0 = __SMLALD(x0, c0, acc0); + + /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ + acc2 = __SMLALD(x2, c0, acc2); + + /* pack x[n-N-1] and x[n-N-2] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read state x[n-N-4], x[n-N-5] */ + x0 = _SIMD32_OFFSET(px); + + /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack x[n-N-3] and x[n-N-4] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ + acc3 = __SMLALDX(x1, c0, acc3); + + /* Read coefficients b[N-2], b[N-3] */ + c0 = *__SIMD32(pb)++; + + /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ + acc0 = __SMLALD(x2, c0, acc0); + + /* Read state x[n-N-6], x[n-N-7] with offset */ + x2 = _SIMD32_OFFSET(px + 2u); + + /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ + acc2 = __SMLALD(x0, c0, acc2); + + /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack x[n-N-5] and x[n-N-6] */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ + acc3 = __SMLALDX(x1, c0, acc3); + + /* Update state pointer for next state reading */ + px += 4u; + + /* Decrement tap count */ + tapCnt--; + + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps. + ** This is always be 2 taps since the filter length is even. */ + if((numTaps & 0x3u) != 0u) + { + + /* Read last two coefficients */ + c0 = *__SIMD32(pb)++; + + /* Perform the multiply-accumulates */ + acc0 = __SMLALD(x0, c0, acc0); + acc2 = __SMLALD(x2, c0, acc2); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x2, x0, 0); +#else + x1 = __PKHBT(x0, x2, 0); +#endif + + /* Read last state variables */ + x0 = *__SIMD32(px); + + /* Perform the multiply-accumulates */ + acc1 = __SMLALDX(x1, c0, acc1); + + /* pack state variables */ +#ifndef ARM_MATH_BIG_ENDIAN + x1 = __PKHBT(x0, x2, 0); +#else + x1 = __PKHBT(x2, x0, 0); +#endif + + /* Perform the multiply-accumulates */ + acc3 = __SMLALDX(x1, c0, acc3); + } + + /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. + ** Then store the 4 outputs in the destination buffer. */ + +#ifndef ARM_MATH_BIG_ENDIAN + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); + +#else + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); + + *__SIMD32(pDst)++ = + __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + while(blkCnt > 0u) + { + /* Copy two samples into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Use SIMD to hold states and coefficients */ + px = pState; + pb = pCoeffs; + + tapCnt = numTaps >> 1u; + + do + { + acc0 += (q31_t) * px++ * *pb++; + acc0 += (q31_t) * px++ * *pb++; + tapCnt--; + } + while(tapCnt > 0u); + + /* The result is in 2.30 format. Convert to 1.15 with saturation. + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1u; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Calculation of count for copying integer writes */ + tapCnt = (numTaps - 1u) >> 2; + + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + tapCnt--; + + } + + /* Calculation of count for remaining q15_t data */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* copy remaining data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } +} + + +#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ + +#else /* ARM_MATH_CM0_FAMILY */ + + +/* Run the below code for Cortex-M0 */ + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + q15_t *pState = S->pState; /* State pointer */ + q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *pStateCurnt; /* Points to the current sample of the state */ + + + + q15_t *px; /* Temporary pointer for state buffer */ + q15_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of nTaps in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + tapCnt = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q31_t) * px++ * *pb++; + tapCnt--; + } while(tapCnt > 0u); + + /* The result is in 2.30 format. Convert to 1.15 + ** Then store the output in the destination buffer. */ + *pDst++ = (q15_t) __SSAT((acc >> 15u), 16); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = (numTaps - 1u); + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +} + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + + + + +/** + * @} end of FIR group + */ diff --git a/gui/cmsis/arm_fir_q31.c b/gui/cmsis/arm_fir_q31.c new file mode 100644 index 0000000..af5707f --- /dev/null +++ b/gui/cmsis/arm_fir_q31.c @@ -0,0 +1,365 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_fir_q31.c +* +* Description: Q31 FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q31 FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * @details + * Scaling and Overflow Behavior: + * \par + * The function is implemented using an internal 64-bit accumulator. + * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit. + * Thus, if the accumulator result overflows it wraps around rather than clip. + * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits. + * After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result. + * + * \par + * Refer to the function arm_fir_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4. + */ + +void arm_fir_q31( + const arm_fir_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pState = S->pState; /* State pointer */ + q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCurnt; /* Points to the current sample of the state */ + + +#ifndef ARM_MATH_CM0_FAMILY + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q31_t x0, x1, x2; /* Temporary variables to hold state */ + q31_t c0; /* Temporary variable to hold coefficient value */ + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc0, acc1, acc2; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt, tapCntN3; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize / 3; + blockSize = blockSize - (3 * blkCnt); + + tapCnt = numTaps / 3; + tapCntN3 = numTaps - (3 * tapCnt); + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy three new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Read the first two samples from the state buffer: + * x[n-numTaps], x[n-numTaps-1] */ + x0 = *(px++); + x1 = *(px++); + + /* Loop unrolling. Process 3 taps at a time. */ + i = tapCnt; + + while(i > 0u) + { + /* Read the b[numTaps] coefficient */ + c0 = *pb; + + /* Read x[n-numTaps-2] sample */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x0 * c0); + acc1 += ((q63_t) x1 * c0); + acc2 += ((q63_t) x2 * c0); + + /* Read the coefficient and state */ + c0 = *(pb + 1u); + x0 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x1 * c0); + acc1 += ((q63_t) x2 * c0); + acc2 += ((q63_t) x0 * c0); + + /* Read the coefficient and state */ + c0 = *(pb + 2u); + x1 = *(px++); + + /* update coefficient pointer */ + pb += 3u; + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x2 * c0); + acc1 += ((q63_t) x0 * c0); + acc2 += ((q63_t) x1 * c0); + + /* Decrement the loop counter */ + i--; + } + + /* If the filter length is not a multiple of 3, compute the remaining filter taps */ + + i = tapCntN3; + + while(i > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x2 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q63_t) x0 * c0); + acc1 += ((q63_t) x1 * c0); + acc2 += ((q63_t) x2 * c0); + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 3 to process the next group of 3 samples */ + pState = pState + 3; + + /* The results in the 3 accumulators are in 2.30 format. Convert to 1.31 + ** Then store the 3 outputs in the destination buffer. */ + *pDst++ = (q31_t) (acc0 >> 31u); + *pDst++ = (q31_t) (acc1 >> 31u); + *pDst++ = (q31_t) (acc2 >> 31u); + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 3, compute any remaining output samples here. + ** No loop unrolling is used. */ + + while(blockSize > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += (q63_t) * (px++) * (*(pb++)); + i--; + } while(i > 0u); + + /* The result is in 2.62 format. Convert to 1.31 + ** Then store the output in the destination buffer. */ + *pDst++ = (q31_t) (acc0 >> 31u); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blockSize--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + q31_t *px; /* Temporary pointer for state */ + q31_t *pb; /* Temporary pointer for coefficient buffer */ + q63_t acc; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Length of the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState buffer contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q63_t) * px++ * *pb++; + i--; + } while(i > 0u); + + /* The result is in 2.62 format. Convert to 1.31 + ** Then store the output in the destination buffer. */ + *pDst++ = (q31_t) (acc >> 31u); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the starting of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Copy numTaps number of values */ + tapCnt = numTaps - 1u; + + /* Copy the data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + +} + +/** + * @} end of FIR group + */ diff --git a/gui/cmsis/arm_fir_q7.c b/gui/cmsis/arm_fir_q7.c new file mode 100644 index 0000000..54a30e2 --- /dev/null +++ b/gui/cmsis/arm_fir_q7.c @@ -0,0 +1,397 @@ +/* ---------------------------------------------------------------------- +* Copyright (C) 2010-2014 ARM Limited. All rights reserved. +* +* $Date: 19. March 2015 +* $Revision: V.1.4.5 +* +* Project: CMSIS DSP Library +* Title: arm_fir_q7.c +* +* Description: Q7 FIR filter processing function. +* +* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* - Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in +* the documentation and/or other materials provided with the +* distribution. +* - Neither the name of ARM LIMITED nor the names of its contributors +* may be used to endorse or promote products derived from this +* software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* -------------------------------------------------------------------- */ + +#include "arm_math.h" + +/** + * @ingroup groupFilters + */ + +/** + * @addtogroup FIR + * @{ + */ + +/** + * @param[in] *S points to an instance of the Q7 FIR filter structure. + * @param[in] *pSrc points to the block of input data. + * @param[out] *pDst points to the block of output data. + * @param[in] blockSize number of samples to process per call. + * @return none. + * + * Scaling and Overflow Behavior: + * \par + * The function is implemented using a 32-bit internal accumulator. + * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result. + * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + * The accumulator is converted to 18.7 format by discarding the low 7 bits. + * Finally, the result is truncated to 1.7 format. + */ + +void arm_fir_q7( + const arm_fir_instance_q7 * S, + q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + +#ifndef ARM_MATH_CM0_FAMILY + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + q7_t *pState = S->pState; /* State pointer */ + q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *pStateCurnt; /* Points to the current sample of the state */ + q7_t x0, x1, x2, x3; /* Temporary variables to hold state */ + q7_t c0; /* Temporary variable to hold coefficient value */ + q7_t *px; /* Temporary pointer for state */ + q7_t *pb; /* Temporary pointer for coefficient buffer */ + q31_t acc0, acc1, acc2, acc3; /* Accumulators */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1u)]); + + /* Apply loop unrolling and compute 4 output values simultaneously. + * The variables acc0 ... acc3 hold output values that are being computed: + * + * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] + * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] + * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] + * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] + */ + blkCnt = blockSize >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while(blkCnt > 0u) + { + /* Copy four new input samples into the state buffer */ + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + *pStateCurnt++ = *pSrc++; + + /* Set all accumulators to zero */ + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Read the first three samples from the state buffer: + * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ + x0 = *(px++); + x1 = *(px++); + x2 = *(px++); + + /* Loop unrolling. Process 4 taps at a time. */ + tapCnt = numTaps >> 2; + i = tapCnt; + + while(i > 0u) + { + /* Read the b[numTaps] coefficient */ + c0 = *pb; + + /* Read x[n-numTaps-3] sample */ + x3 = *px; + + /* acc0 += b[numTaps] * x[n-numTaps] */ + acc0 += ((q15_t) x0 * c0); + + /* acc1 += b[numTaps] * x[n-numTaps-1] */ + acc1 += ((q15_t) x1 * c0); + + /* acc2 += b[numTaps] * x[n-numTaps-2] */ + acc2 += ((q15_t) x2 * c0); + + /* acc3 += b[numTaps] * x[n-numTaps-3] */ + acc3 += ((q15_t) x3 * c0); + + /* Read the b[numTaps-1] coefficient */ + c0 = *(pb + 1u); + + /* Read x[n-numTaps-4] sample */ + x0 = *(px + 1u); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x1 * c0); + acc1 += ((q15_t) x2 * c0); + acc2 += ((q15_t) x3 * c0); + acc3 += ((q15_t) x0 * c0); + + /* Read the b[numTaps-2] coefficient */ + c0 = *(pb + 2u); + + /* Read x[n-numTaps-5] sample */ + x1 = *(px + 2u); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x2 * c0); + acc1 += ((q15_t) x3 * c0); + acc2 += ((q15_t) x0 * c0); + acc3 += ((q15_t) x1 * c0); + + /* Read the b[numTaps-3] coefficients */ + c0 = *(pb + 3u); + + /* Read x[n-numTaps-6] sample */ + x2 = *(px + 3u); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x3 * c0); + acc1 += ((q15_t) x0 * c0); + acc2 += ((q15_t) x1 * c0); + acc3 += ((q15_t) x2 * c0); + + /* update coefficient pointer */ + pb += 4u; + px += 4u; + + /* Decrement the loop counter */ + i--; + } + + /* If the filter length is not a multiple of 4, compute the remaining filter taps */ + + i = numTaps - (tapCnt * 4u); + while(i > 0u) + { + /* Read coefficients */ + c0 = *(pb++); + + /* Fetch 1 state variable */ + x3 = *(px++); + + /* Perform the multiply-accumulates */ + acc0 += ((q15_t) x0 * c0); + acc1 += ((q15_t) x1 * c0); + acc2 += ((q15_t) x2 * c0); + acc3 += ((q15_t) x3 * c0); + + /* Reuse the present sample states for next sample */ + x0 = x1; + x1 = x2; + x2 = x3; + + /* Decrement the loop counter */ + i--; + } + + /* Advance the state pointer by 4 to process the next group of 4 samples */ + pState = pState + 4; + + /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31 + ** Then store the 4 outputs in the destination buffer. */ + acc0 = __SSAT((acc0 >> 7u), 8); + *pDst++ = acc0; + acc1 = __SSAT((acc1 >> 7u), 8); + *pDst++ = acc1; + acc2 = __SSAT((acc2 >> 7u), 8); + *pDst++ = acc2; + acc3 = __SSAT((acc3 >> 7u), 8); + *pDst++ = acc3; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 4u; + + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = (pCoeffs); + + i = numTaps; + + /* Perform the multiply-accumulates */ + do + { + acc0 += (q15_t) * (px++) * (*(pb++)); + i--; + } while(i > 0u); + + /* The result is in 2.14 format. Convert to 1.7 + ** Then store the output in the destination buffer. */ + *pDst++ = __SSAT((acc0 >> 7u), 8); + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1; + + /* Decrement the samples loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + tapCnt = (numTaps - 1u) >> 2u; + + /* copy data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + + /* Calculate remaining number of copies */ + tapCnt = (numTaps - 1u) % 0x4u; + + /* Copy the remaining q31_t data */ + while(tapCnt > 0u) + { + *pStateCurnt++ = *pState++; + + /* Decrement the loop counter */ + tapCnt--; + } + +#else + +/* Run the below code for Cortex-M0 */ + + uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ + uint32_t i, blkCnt; /* Loop counters */ + q7_t *pState = S->pState; /* State pointer */ + q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *px, *pb; /* Temporary pointers to state and coeff */ + q31_t acc = 0; /* Accumlator */ + q7_t *pStateCurnt; /* Points to the current sample of the state */ + + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = S->pState + (numTaps - 1u); + + /* Initialize blkCnt with blockSize */ + blkCnt = blockSize; + + /* Perform filtering upto BlockSize - BlockSize%4 */ + while(blkCnt > 0u) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set accumulator to zero */ + acc = 0; + + /* Initialize state pointer of type q7 */ + px = pState; + + /* Initialize coeff pointer of type q7 */ + pb = pCoeffs; + + + i = numTaps; + + while(i > 0u) + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc += (q15_t) * px++ * *pb++; + i--; + } + + /* Store the 1.7 format filter output in destination buffer */ + *pDst++ = (q7_t) __SSAT((acc >> 7), 8); + + /* Advance the state pointer by 1 to process the next sample */ + pState = pState + 1; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* Processing is complete. + ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. + ** This prepares the state buffer for the next function call. */ + + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + + /* Copy numTaps number of values */ + i = (numTaps - 1u); + + /* Copy q7_t data */ + while(i > 0u) + { + *pStateCurnt++ = *pState++; + i--; + } + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + +} + +/** + * @} end of FIR group + */ diff --git a/gui/wxmain.cpp b/gui/wxmain.cpp index 02dc4c0..75eedb6 100644 --- a/gui/wxmain.cpp +++ b/gui/wxmain.cpp @@ -29,14 +29,17 @@ #include #include -#ifndef WIN32 +#ifndef STMDSP_WIN32 #include +#include #endif #include "wxmain_devdata.h" enum Id { - MeasureTimer = 1, + TimerPerformance = 1, + TimerRecord, + TimerWavClip, MFileNew, MFileOpen, @@ -56,7 +59,7 @@ enum Id { MRunGenStart, MCodeCompile, MCodeDisassemble, - CompileOutput + CompileOutput }; MainFrame::MainFrame() : @@ -87,14 +90,16 @@ MainFrame::MainFrame() : wxEmptyString, wxDefaultPosition, wxSize(620, 250), wxTE_READONLY | wxTE_MULTILINE | wxHSCROLL | wxTE_RICH2); - m_measure_timer = new wxTimer(this, Id::MeasureTimer); + m_timer_performance = new wxTimer(this, Id::TimerPerformance); + m_timer_record = new wxTimer(this, Id::TimerRecord); + m_timer_wavclip = new wxTimer(this, Id::TimerWavClip); m_menu_bar = new wxMenuBar; m_rate_select = new wxComboBox(panelToolbar, wxID_ANY, wxEmptyString, wxDefaultPosition, wxDefaultSize, srateValues.size(), srateValues.data(), wxCB_READONLY); -#ifndef WIN32 +#ifndef STMDSP_WIN32 m_device_samples = reinterpret_cast(::mmap( nullptr, stmdsp::SAMPLES_MAX * sizeof(stmdsp::adcsample_t), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0)); @@ -102,8 +107,8 @@ MainFrame::MainFrame() : nullptr, stmdsp::SAMPLES_MAX * sizeof(stmdsp::adcsample_t), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0)); #else - m_device_samples = new stmdsp::adcsample_t[stmdsp::SAMPLES_MAX]; - m_device_samples_input = new stmdsp::adcsample_t[stmdsp::SAMPLES_MAX]; + m_device_samples = new stmdsp::adcsample_t[stmdsp::SAMPLES_MAX]; + m_device_samples_input = new stmdsp::adcsample_t[stmdsp::SAMPLES_MAX]; #endif m_menu_bar->Append(menuFile, "&File"); @@ -143,10 +148,12 @@ MainFrame::MainFrame() : // Binds: // General - Bind(wxEVT_TIMER, &MainFrame::onMeasureTimer, this, Id::MeasureTimer); - Bind(wxEVT_CLOSE_WINDOW, &MainFrame::onCloseEvent, this, wxID_ANY); + Bind(wxEVT_TIMER, &MainFrame::onTimerPerformance, this, Id::TimerPerformance); + Bind(wxEVT_TIMER, &MainFrame::onTimerRecord, this, Id::TimerRecord); + Bind(wxEVT_TIMER, &MainFrame::onTimerWavClip, this, Id::TimerWavClip); + Bind(wxEVT_CLOSE_WINDOW, &MainFrame::onCloseEvent, this, wxID_ANY); m_compile_output-> - Bind(wxEVT_PAINT, &MainFrame::onPaint, this, Id::CompileOutput); + Bind(wxEVT_PAINT, &MainFrame::onPaint, this, Id::CompileOutput); // Toolbar actions Bind(wxEVT_BUTTON, &MainFrame::onRunCompile, this, wxID_ANY, wxID_ANY, comp); @@ -192,7 +199,9 @@ void MainFrame::onCloseEvent(wxCloseEvent& event) //delete m_menu_bar->Remove(1); //delete m_menu_bar->Remove(0); //delete m_menu_bar; - delete m_measure_timer; + delete m_timer_performance; + delete m_timer_record; + delete m_timer_wavclip; delete m_device; Unbind(wxEVT_COMBOBOX, &MainFrame::onToolbarSampleRate, this, wxID_ANY, wxID_ANY); @@ -201,46 +210,45 @@ void MainFrame::onCloseEvent(wxCloseEvent& event) event.Skip(); } -// Measure timer tick handler -// Only called while connected and running. -void MainFrame::onMeasureTimer(wxTimerEvent&) +void MainFrame::onTimerPerformance(wxTimerEvent&) { - if (m_conv_result_log || m_run_draw_samples->IsChecked()) { - auto samples = m_device->continuous_read(); - if (samples.size() > 0) { - std::copy(samples.cbegin(), samples.cend(), m_device_samples); - - if (m_conv_result_log) { - for (auto& s : samples) { - auto str = std::to_string(s) + ','; - m_conv_result_log->Write(str.c_str(), str.size()); - } - m_conv_result_log->Write("\n", 1); - } - if (m_run_draw_samples->IsChecked()) { - samples = m_device->continuous_read_input(); - std::copy(samples.cbegin(), samples.cend(), m_device_samples_input); - m_compile_output->Refresh(); + // Show execution time + m_status_bar->SetStatusText(wxString::Format(wxT("Execution time: %u cycles"), + m_device->continuous_start_get_measurement())); +} + +void MainFrame::onTimerRecord(wxTimerEvent&) +{ + auto samples = m_device->continuous_read(); + if (samples.size() > 0) { + std::copy(samples.cbegin(), samples.cend(), m_device_samples); + + if (m_conv_result_log) { + for (auto& s : samples) { + auto str = std::to_string(s) + ','; + m_conv_result_log->Write(str.c_str(), str.size()); } + m_conv_result_log->Write("\n", 1); } - } - if (m_wav_clip) { - // Stream out next WAV chunk - auto size = m_device->get_buffer_size(); - auto chunk = new stmdsp::adcsample_t[size]; - auto src = reinterpret_cast(m_wav_clip->next(size)); - for (unsigned int i = 0; i < size; i++) - chunk[i] = ((uint32_t)*src++) / 16 + 2048; - m_device->siggen_upload(chunk, size); - delete[] chunk; + if (m_run_draw_samples->IsChecked()) { + samples = m_device->continuous_read_input(); + std::copy(samples.cbegin(), samples.cend(), m_device_samples_input); + m_compile_output->Refresh(); + } } +} - if (m_run_measure->IsChecked()) { - // Show execution time - m_status_bar->SetStatusText(wxString::Format(wxT("Execution time: %u cycles"), - m_device->continuous_start_get_measurement())); - } +void MainFrame::onTimerWavClip(wxTimerEvent&) +{ + // Stream out next WAV chunk + auto size = m_device->get_buffer_size(); + auto chunk = new stmdsp::adcsample_t[size]; + auto src = reinterpret_cast(m_wav_clip->next(size)); + for (unsigned int i = 0; i < size; i++) + chunk[i] = ((uint32_t)*src++) / 16 + 2048; + m_device->siggen_upload(chunk, size); + delete[] chunk; } void MainFrame::onPaint(wxPaintEvent&) @@ -248,9 +256,9 @@ void MainFrame::onPaint(wxPaintEvent&) if (!m_is_running || !m_run_draw_samples->IsChecked()) return; - const auto& dim = m_compile_output->GetSize(); + const auto& dim = m_compile_output->GetSize(); wxRect rect { - 0, 0, dim.GetWidth(), dim.GetHeight() + 0, 0, dim.GetWidth(), dim.GetHeight() }; wxBufferedPaintDC dc (m_compile_output); @@ -323,10 +331,10 @@ void MainFrame::prepareEditor() wxString MainFrame::compileEditorCode() { - stmdsp::platform platform; + stmdsp::platform platform; if (m_device != nullptr) { - platform = m_device->get_platform(); - } else { + platform = m_device->get_platform(); + } else { m_status_bar->SetStatusText("Assuming L4 platform..."); platform = stmdsp::platform::L4; } @@ -342,17 +350,19 @@ wxString MainFrame::compileEditorCode() file.Write(wxString(file_text) + m_text_editor->GetText()); file.Close(); - constexpr const char *script_ext = + constexpr const char *script_ext = #ifndef STMDSP_WIN32 - ".sh"; + ".sh"; #else - ".bat"; + ".bat"; #endif wxFile makefile (m_temp_file_name + script_ext, wxFile::write); wxString make_text (platform == stmdsp::platform::L4 ? makefile_text_l4 : makefile_text_h7); make_text.Replace("$0", m_temp_file_name); + char cwd[PATH_MAX]; + make_text.Replace("$1", getcwd(cwd, sizeof(cwd))); makefile.Write(make_text); makefile.Close(); @@ -363,12 +373,12 @@ wxString MainFrame::compileEditorCode() #ifndef STMDSP_WIN32 system(wxString("chmod +x ") + m_temp_file_name + script_ext); #endif - int result = system(make_command.ToAscii()); - wxFile result_file (make_output); - wxString result_text; - result_file.ReadAll(&result_text); - result_file.Close(); - m_compile_output->Clear(); + int result = system(make_command.ToAscii()); + wxFile result_file (make_output); + wxString result_text; + result_file.ReadAll(&result_text); + result_file.Close(); + m_compile_output->Clear(); m_compile_output->WriteText(result_text); wxRemoveFile(m_temp_file_name); @@ -401,12 +411,12 @@ void MainFrame::onCodeDisassemble(wxCommandEvent&) if (system(command.ToAscii()) == 0) { wxFile result_file (output); - wxString result_text; - result_file.ReadAll(&result_text); - result_file.Close(); - m_compile_output->Clear(); - m_compile_output->WriteText(result_text); - wxRemoveFile(output); + wxString result_text; + result_file.ReadAll(&result_text); + result_file.Close(); + m_compile_output->Clear(); + m_compile_output->WriteText(result_text); + wxRemoveFile(output); m_status_bar->SetStatusText(wxString::Format(wxT("Done. Line count: %u."), m_compile_output->GetNumberOfLines())); } else { diff --git a/gui/wxmain.hpp b/gui/wxmain.hpp index 9b3db1e..11131eb 100644 --- a/gui/wxmain.hpp +++ b/gui/wxmain.hpp @@ -59,7 +59,9 @@ public: void onCodeDisassemble(wxCommandEvent&); void onPaint(wxPaintEvent&); - void onMeasureTimer(wxTimerEvent&); + void onTimerPerformance(wxTimerEvent&); + void onTimerRecord(wxTimerEvent&); + void onTimerWavClip(wxTimerEvent&); private: // Set to true if connected and running @@ -71,7 +73,9 @@ private: wxControl *m_signal_area = nullptr; wxMenuItem *m_run_measure = nullptr; wxMenuItem *m_run_draw_samples = nullptr; - wxTimer *m_measure_timer = nullptr; + wxTimer *m_timer_performance = nullptr; + wxTimer *m_timer_record = nullptr; + wxTimer *m_timer_wavclip = nullptr; wxStatusBar *m_status_bar = nullptr; wxMenuBar *m_menu_bar = nullptr; wxComboBox *m_rate_select = nullptr; diff --git a/gui/wxmain_devdata.cpp b/gui/wxmain_devdata.cpp index 0f384fa..551f6cc 100644 --- a/gui/wxmain_devdata.cpp +++ b/gui/wxmain_devdata.cpp @@ -26,13 +26,14 @@ const std::array srateNums { #endif // $0 = temp file name +// TODO try -ffunction-sections -fdata-sections -Wl,--gc-sections const char *makefile_text_h7 = #ifdef STMDSP_WIN32 "echo off" NEWLINE #endif "arm-none-eabi-g++ -x c++ -Os -std=c++20 -fno-exceptions -fno-rtti " "-mcpu=cortex-m7 -mthumb -mfloat-abi=hard -mfpu=fpv5-d16 -mtune=cortex-m7 " - "-nostartfiles " + "-nostartfiles " "-Wl,-Ttext-segment=0x00000000 -Wl,-zmax-page-size=512 -Wl,-eprocess_data_entry " "$0 -o $0.o" NEWLINE COPY " $0.o $0.orig.o" NEWLINE @@ -48,7 +49,7 @@ const char *makefile_text_l4 = #endif "arm-none-eabi-g++ -x c++ -Os -std=c++20 -fno-exceptions -fno-rtti " "-mcpu=cortex-m4 -mthumb -mfloat-abi=hard -mfpu=fpv4-sp-d16 -mtune=cortex-m4 " - "-nostartfiles " + "-nostartfiles -I$1/cmsis" "-Wl,-Ttext-segment=0x10000000 -Wl,-zmax-page-size=512 -Wl,-eprocess_data_entry " "$0 -o $0.o" NEWLINE COPY " $0.o $0.orig.o" NEWLINE diff --git a/gui/wxmain_mrun.cpp b/gui/wxmain_mrun.cpp index 5661832..58b1873 100644 --- a/gui/wxmain_mrun.cpp +++ b/gui/wxmain_mrun.cpp @@ -46,15 +46,19 @@ void MainFrame::onRunStart(wxCommandEvent& ce) if (!m_is_running) { if (m_run_measure->IsChecked()) { m_device->continuous_start_measure(); - m_measure_timer->StartOnce(1000); + m_timer_performance->StartOnce(1000); } else { if (m_device->is_siggening() && m_wav_clip) { - m_measure_timer->Start(m_device->get_buffer_size() * 500 / + // TODO Confirm need for factor of 500 + m_timer_wavclip->Start(m_device->get_buffer_size() * 500 / srateNums[m_rate_select->GetSelection()]); } else if (m_conv_result_log) { - m_measure_timer->Start(15); + m_timer_record->Start(m_device->get_buffer_size() / + srateNums[m_rate_select->GetSelection()] * + 800 / 1000); } else if (m_run_draw_samples->IsChecked()) { - m_measure_timer->Start(300); + m_timer_record->Start(m_device->get_buffer_size() / + srateNums[m_rate_select->GetSelection()]); } m_device->continuous_start(); @@ -66,7 +70,9 @@ void MainFrame::onRunStart(wxCommandEvent& ce) m_is_running = true; } else { m_device->continuous_stop(); - m_measure_timer->Stop(); + m_timer_performance->Stop(); + m_timer_record->Stop(); + m_timer_wavclip->Stop(); m_rate_select->Enable(true); menuItem->SetItemLabel("&Start"); diff --git a/source/main.cpp b/source/main.cpp index e151a3d..aaf268a 100644 --- a/source/main.cpp +++ b/source/main.cpp @@ -99,6 +99,9 @@ static unsigned char elf_file_store[MAX_ELF_FILE_SIZE]; __attribute__((section(".convdata"))) static ELF::Entry elf_entry = nullptr; +static char userMessageBuffer[128]; +static unsigned char userMessageSize = 0; + __attribute__((section(".convcode"))) static void conversion_unprivileged_main(); @@ -187,6 +190,7 @@ THD_FUNCTION(communicationThread, arg) // 'S' - Stop conversion. // 's' - Get latest block of conversion results. // 't' - Get latest block of conversion input. + // 'u' - Get user message. // 'W' - Start signal generator (siggen). // 'w' - Stop siggen. @@ -361,6 +365,10 @@ THD_FUNCTION(communicationThread, arg) } break; + case 'u': + USBSerial::write(reinterpret_cast(userMessageBuffer), userMessageSize); + break; + case 'W': DAC::start(1, samplesSigGen.data(), samplesSigGen.size()); break; @@ -648,6 +656,17 @@ void port_syscall(struct port_extctx *ctxp, uint32_t n) case 3: ctxp->r0 = ADC::readAlt(0); break; + case 4: + { + const char *str = reinterpret_cast(ctxp->r0); + auto src = str; + auto dst = userMessageBuffer; + while (*src) + *dst++ = *src++; + *dst = '\0'; + userMessageSize = src - str; + } + break; default: while (1); break;