summaryrefslogtreecommitdiffstats
path: root/Drivers/CMSIS/DSP/Source/SupportFunctions
diff options
context:
space:
mode:
authorClyne Sullivan <clyne@bitgloo.com>2025-01-29 21:34:25 -0500
committerClyne Sullivan <clyne@bitgloo.com>2025-01-29 21:34:25 -0500
commit5b81bc8ccbd342b8566d88fc9f17a73aec03b5b6 (patch)
treecc57486912cfa74c6440d8b97c28f451ec787d78 /Drivers/CMSIS/DSP/Source/SupportFunctions
initial commit
Diffstat (limited to 'Drivers/CMSIS/DSP/Source/SupportFunctions')
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt28
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c63
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c36
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c274
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c414
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c1039
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c104
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c130
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c192
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c71
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c130
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c135
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c132
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c134
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c157
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c127
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c189
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c71
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c134
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c135
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c133
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c131
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c308
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c314
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c330
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c119
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c93
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c127
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c53
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c155
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c207
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c182
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c190
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c202
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c181
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c169
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c218
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c188
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c164
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c181
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c107
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c86
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c54
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c146
-rw-r--r--Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c187
45 files changed, 7920 insertions, 0 deletions
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt b/Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt
new file mode 100644
index 0000000..41b13f2
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required (VERSION 3.14)
+
+project(CMSISDSPSupport)
+
+include(configLib)
+include(configDsp)
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPSupport STATIC ${SRC})
+
+configLib(CMSISDSPSupport ${ROOT})
+configDsp(CMSISDSPSupport ${ROOT})
+
+### Includes
+target_include_directories(CMSISDSPSupport PUBLIC "${DSP}/Include")
+
+if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
+target_sources(CMSISDSPSupport PRIVATE arm_copy_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_fill_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_f16_to_q15.c)
+target_sources(CMSISDSPSupport PRIVATE arm_q15_to_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_float_to_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_f16_to_float.c)
+target_sources(CMSISDSPSupport PRIVATE arm_weighted_sum_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_barycenter_f16.c)
+endif()
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c
new file mode 100644
index 0000000..ca8b1b6
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: SupportFunctions.c
+ * Description: Combination of all support function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_barycenter_f32.c"
+#include "arm_bitonic_sort_f32.c"
+#include "arm_bubble_sort_f32.c"
+#include "arm_copy_f32.c"
+#include "arm_copy_f64.c"
+#include "arm_copy_q15.c"
+#include "arm_copy_q31.c"
+#include "arm_copy_q7.c"
+#include "arm_fill_f32.c"
+#include "arm_fill_f64.c"
+#include "arm_fill_q15.c"
+#include "arm_fill_q31.c"
+#include "arm_fill_q7.c"
+#include "arm_heap_sort_f32.c"
+#include "arm_insertion_sort_f32.c"
+#include "arm_merge_sort_f32.c"
+#include "arm_merge_sort_init_f32.c"
+#include "arm_quick_sort_f32.c"
+#include "arm_selection_sort_f32.c"
+#include "arm_sort_f32.c"
+#include "arm_sort_init_f32.c"
+#include "arm_weighted_sum_f32.c"
+
+#include "arm_float_to_q15.c"
+#include "arm_float_to_q31.c"
+#include "arm_float_to_q7.c"
+#include "arm_q15_to_float.c"
+#include "arm_q15_to_q31.c"
+#include "arm_q15_to_q7.c"
+#include "arm_q31_to_float.c"
+#include "arm_q31_to_q15.c"
+#include "arm_q31_to_q7.c"
+#include "arm_q7_to_float.c"
+#include "arm_q7_to_q15.c"
+#include "arm_q7_to_q31.c"
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c
new file mode 100644
index 0000000..0e39d8d
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: SupportFunctions.c
+ * Description: Combination of all support function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_copy_f16.c"
+#include "arm_fill_f16.c"
+#include "arm_f16_to_q15.c"
+#include "arm_f16_to_float.c"
+#include "arm_q15_to_f16.c"
+#include "arm_float_to_f16.c"
+#include "arm_weighted_sum_f16.c"
+#include "arm_barycenter_f16.c"
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c
new file mode 100644
index 0000000..6dfe55c
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c
@@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_barycenter_f16.c
+ * Description: Barycenter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup barycenter Barycenter
+
+ Barycenter of weighted vectors
+ */
+
+/**
+ @addtogroup barycenter
+ @{
+ */
+
+
+/**
+ * @brief Barycenter
+ *
+ *
+ * @param[in] *in List of vectors
+ * @param[in] *weights Weights of the vectors
+ * @param[out] *out Barycenter
+ * @param[in] nbVectors Number of vectors
+ * @param[in] vecDim Dimension of space (vector dimension)
+ * @return None
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_barycenter_f16(const float16_t *in,
+ const float16_t *weights,
+ float16_t *out,
+ uint32_t nbVectors,
+ uint32_t vecDim)
+{
+ const float16_t *pIn, *pW;
+ const float16_t *pIn1, *pIn2, *pIn3, *pIn4;
+ float16_t *pOut;
+ uint32_t blkCntVector, blkCntSample;
+ float16_t accum, w;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+
+ arm_fill_f16(0.0f, out, vecDim);
+
+
+ /* Sum */
+ pIn1 = pIn;
+ pIn2 = pIn1 + vecDim;
+ pIn3 = pIn2 + vecDim;
+ pIn4 = pIn3 + vecDim;
+
+ blkCntVector = nbVectors >> 2;
+ while (blkCntVector > 0)
+ {
+ f16x8_t outV, inV1, inV2, inV3, inV4;
+ float16_t w1, w2, w3, w4;
+
+ pOut = out;
+ w1 = *pW++;
+ w2 = *pW++;
+ w3 = *pW++;
+ w4 = *pW++;
+ accum += (_Float16)w1 + (_Float16)w2 + (_Float16)w3 + (_Float16)w4;
+
+ blkCntSample = vecDim >> 3;
+ while (blkCntSample > 0) {
+ outV = vld1q((const float16_t *) pOut);
+ inV1 = vld1q(pIn1);
+ inV2 = vld1q(pIn2);
+ inV3 = vld1q(pIn3);
+ inV4 = vld1q(pIn4);
+ outV = vfmaq(outV, inV1, w1);
+ outV = vfmaq(outV, inV2, w2);
+ outV = vfmaq(outV, inV3, w3);
+ outV = vfmaq(outV, inV4, w4);
+ vst1q(pOut, outV);
+
+ pOut += 8;
+ pIn1 += 8;
+ pIn2 += 8;
+ pIn3 += 8;
+ pIn4 += 8;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 7;
+ while (blkCntSample > 0) {
+ *pOut = (_Float16)*pOut + (_Float16)*pIn1++ * (_Float16)w1;
+ *pOut = (_Float16)*pOut + (_Float16)*pIn2++ * (_Float16)w2;
+ *pOut = (_Float16)*pOut + (_Float16)*pIn3++ * (_Float16)w3;
+ *pOut = (_Float16)*pOut + (_Float16)*pIn4++ * (_Float16)w4;
+ pOut++;
+ blkCntSample--;
+ }
+
+ pIn1 += 3 * vecDim;
+ pIn2 += 3 * vecDim;
+ pIn3 += 3 * vecDim;
+ pIn4 += 3 * vecDim;
+
+ blkCntVector--;
+ }
+
+ pIn = pIn1;
+
+ blkCntVector = nbVectors & 3;
+ while (blkCntVector > 0)
+ {
+ f16x8_t inV, outV;
+
+ pOut = out;
+ w = *pW++;
+ accum += (_Float16)w;
+
+ blkCntSample = vecDim >> 3;
+ while (blkCntSample > 0)
+ {
+ outV = vld1q_f16(pOut);
+ inV = vld1q_f16(pIn);
+ outV = vfmaq(outV, inV, w);
+ vst1q_f16(pOut, outV);
+ pOut += 8;
+ pIn += 8;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 7;
+ while (blkCntSample > 0)
+ {
+ *pOut = (_Float16)*pOut + (_Float16)*pIn++ * (_Float16)w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ pOut = out;
+ accum = 1.0f16 / (_Float16)accum;
+
+ blkCntSample = vecDim >> 3;
+ while (blkCntSample > 0)
+ {
+ f16x8_t tmp;
+
+ tmp = vld1q((const float16_t *) pOut);
+ tmp = vmulq(tmp, accum);
+ vst1q(pOut, tmp);
+ pOut += 8;
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 7;
+ while (blkCntSample > 0)
+ {
+ *pOut = (_Float16)*pOut * (_Float16)accum;
+ pOut++;
+ blkCntSample--;
+ }
+}
+#else
+void arm_barycenter_f16(const float16_t *in, const float16_t *weights, float16_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+ const float16_t *pIn,*pW;
+ float16_t *pOut;
+ uint32_t blkCntVector,blkCntSample;
+ float16_t accum, w;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f16;
+
+ pW = weights;
+ pIn = in;
+
+ /* Set counters to 0 */
+ blkCntSample = vecDim;
+ pOut = out;
+
+ while(blkCntSample > 0)
+ {
+ *pOut = 0.0f16;
+ pOut++;
+ blkCntSample--;
+ }
+
+ /* Sum */
+ while(blkCntVector > 0)
+ {
+ pOut = out;
+ w = *pW++;
+ accum += (_Float16)w;
+
+ blkCntSample = vecDim;
+ while(blkCntSample > 0)
+ {
+ *pOut = (_Float16)*pOut + (_Float16)*pIn++ * (_Float16)w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ blkCntSample = vecDim;
+ pOut = out;
+
+ while(blkCntSample > 0)
+ {
+ *pOut = (_Float16)*pOut / (_Float16)accum;
+ pOut++;
+ blkCntSample--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of barycenter group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
new file mode 100644
index 0000000..817bb58
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
@@ -0,0 +1,414 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_barycenter_f32.c
+ * Description: Barycenter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ @ingroup barycenter
+ */
+
+
+/**
+ * @brief Barycenter
+ *
+ *
+ * @param[in] *in List of vectors
+ * @param[in] *weights Weights of the vectors
+ * @param[out] *out Barycenter
+ * @param[in] nbVectors Number of vectors
+ * @param[in] vecDim Dimension of space (vector dimension)
+ * @return None
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_barycenter_f32(const float32_t *in,
+ const float32_t *weights,
+ float32_t *out,
+ uint32_t nbVectors,
+ uint32_t vecDim)
+{
+ const float32_t *pIn, *pW;
+ const float32_t *pIn1, *pIn2, *pIn3, *pIn4;
+ float32_t *pOut;
+ uint32_t blkCntVector, blkCntSample;
+ float32_t accum, w;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+
+ arm_fill_f32(0.0f, out, vecDim);
+
+
+ /* Sum */
+ pIn1 = pIn;
+ pIn2 = pIn1 + vecDim;
+ pIn3 = pIn2 + vecDim;
+ pIn4 = pIn3 + vecDim;
+
+ blkCntVector = nbVectors >> 2;
+ while (blkCntVector > 0)
+ {
+ f32x4_t outV, inV1, inV2, inV3, inV4;
+ float32_t w1, w2, w3, w4;
+
+ pOut = out;
+ w1 = *pW++;
+ w2 = *pW++;
+ w3 = *pW++;
+ w4 = *pW++;
+ accum += w1 + w2 + w3 + w4;
+
+ blkCntSample = vecDim >> 2;
+ while (blkCntSample > 0) {
+ outV = vld1q((const float32_t *) pOut);
+ inV1 = vld1q(pIn1);
+ inV2 = vld1q(pIn2);
+ inV3 = vld1q(pIn3);
+ inV4 = vld1q(pIn4);
+ outV = vfmaq(outV, inV1, w1);
+ outV = vfmaq(outV, inV2, w2);
+ outV = vfmaq(outV, inV3, w3);
+ outV = vfmaq(outV, inV4, w4);
+ vst1q(pOut, outV);
+
+ pOut += 4;
+ pIn1 += 4;
+ pIn2 += 4;
+ pIn3 += 4;
+ pIn4 += 4;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while (blkCntSample > 0) {
+ *pOut = *pOut + *pIn1++ * w1;
+ *pOut = *pOut + *pIn2++ * w2;
+ *pOut = *pOut + *pIn3++ * w3;
+ *pOut = *pOut + *pIn4++ * w4;
+ pOut++;
+ blkCntSample--;
+ }
+
+ pIn1 += 3 * vecDim;
+ pIn2 += 3 * vecDim;
+ pIn3 += 3 * vecDim;
+ pIn4 += 3 * vecDim;
+
+ blkCntVector--;
+ }
+
+ pIn = pIn1;
+
+ blkCntVector = nbVectors & 3;
+ while (blkCntVector > 0)
+ {
+ f32x4_t inV, outV;
+
+ pOut = out;
+ w = *pW++;
+ accum += w;
+
+ blkCntSample = vecDim >> 2;
+ while (blkCntSample > 0)
+ {
+ outV = vld1q_f32(pOut);
+ inV = vld1q_f32(pIn);
+ outV = vfmaq(outV, inV, w);
+ vst1q_f32(pOut, outV);
+ pOut += 4;
+ pIn += 4;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while (blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn++ * w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ pOut = out;
+ accum = 1.0f / accum;
+
+ blkCntSample = vecDim >> 2;
+ while (blkCntSample > 0)
+ {
+ f32x4_t tmp;
+
+ tmp = vld1q((const float32_t *) pOut);
+ tmp = vmulq(tmp, accum);
+ vst1q(pOut, tmp);
+ pOut += 4;
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while (blkCntSample > 0)
+ {
+ *pOut = *pOut * accum;
+ pOut++;
+ blkCntSample--;
+ }
+}
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+ const float32_t *pIn,*pW, *pIn1, *pIn2, *pIn3, *pIn4;
+ float32_t *pOut;
+ uint32_t blkCntVector,blkCntSample;
+ float32_t accum, w,w1,w2,w3,w4;
+
+ float32x4_t tmp, inV,outV, inV1, inV2, inV3, inV4;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+ /* Set counters to 0 */
+ tmp = vdupq_n_f32(0.0f);
+ pOut = out;
+
+ blkCntSample = vecDim >> 2;
+ while(blkCntSample > 0)
+ {
+ vst1q_f32(pOut, tmp);
+ pOut += 4;
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while(blkCntSample > 0)
+ {
+ *pOut = 0.0f;
+ pOut++;
+ blkCntSample--;
+ }
+
+ /* Sum */
+
+ pIn1 = pIn;
+ pIn2 = pIn1 + vecDim;
+ pIn3 = pIn2 + vecDim;
+ pIn4 = pIn3 + vecDim;
+
+ blkCntVector = nbVectors >> 2;
+ while(blkCntVector > 0)
+ {
+ pOut = out;
+ w1 = *pW++;
+ w2 = *pW++;
+ w3 = *pW++;
+ w4 = *pW++;
+ accum += w1 + w2 + w3 + w4;
+
+ blkCntSample = vecDim >> 2;
+ while(blkCntSample > 0)
+ {
+ outV = vld1q_f32(pOut);
+ inV1 = vld1q_f32(pIn1);
+ inV2 = vld1q_f32(pIn2);
+ inV3 = vld1q_f32(pIn3);
+ inV4 = vld1q_f32(pIn4);
+ outV = vmlaq_n_f32(outV,inV1,w1);
+ outV = vmlaq_n_f32(outV,inV2,w2);
+ outV = vmlaq_n_f32(outV,inV3,w3);
+ outV = vmlaq_n_f32(outV,inV4,w4);
+ vst1q_f32(pOut, outV);
+ pOut += 4;
+ pIn1 += 4;
+ pIn2 += 4;
+ pIn3 += 4;
+ pIn4 += 4;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn1++ * w1;
+ *pOut = *pOut + *pIn2++ * w2;
+ *pOut = *pOut + *pIn3++ * w3;
+ *pOut = *pOut + *pIn4++ * w4;
+ pOut++;
+ blkCntSample--;
+ }
+
+ pIn1 += 3*vecDim;
+ pIn2 += 3*vecDim;
+ pIn3 += 3*vecDim;
+ pIn4 += 3*vecDim;
+
+ blkCntVector--;
+ }
+
+ pIn = pIn1;
+
+ blkCntVector = nbVectors & 3;
+ while(blkCntVector > 0)
+ {
+ pOut = out;
+ w = *pW++;
+ accum += w;
+
+ blkCntSample = vecDim >> 2;
+ while(blkCntSample > 0)
+ {
+ outV = vld1q_f32(pOut);
+ inV = vld1q_f32(pIn);
+ outV = vmlaq_n_f32(outV,inV,w);
+ vst1q_f32(pOut, outV);
+ pOut += 4;
+ pIn += 4;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn++ * w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ pOut = out;
+ accum = 1.0f / accum;
+
+ blkCntSample = vecDim >> 2;
+ while(blkCntSample > 0)
+ {
+ tmp = vld1q_f32(pOut);
+ tmp = vmulq_n_f32(tmp,accum);
+ vst1q_f32(pOut, tmp);
+ pOut += 4;
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut * accum;
+ pOut++;
+ blkCntSample--;
+ }
+
+}
+#else
+void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+ const float32_t *pIn,*pW;
+ float32_t *pOut;
+ uint32_t blkCntVector,blkCntSample;
+ float32_t accum, w;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+ /* Set counters to 0 */
+ blkCntSample = vecDim;
+ pOut = out;
+
+ while(blkCntSample > 0)
+ {
+ *pOut = 0.0f;
+ pOut++;
+ blkCntSample--;
+ }
+
+ /* Sum */
+ while(blkCntVector > 0)
+ {
+ pOut = out;
+ w = *pW++;
+ accum += w;
+
+ blkCntSample = vecDim;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn++ * w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ blkCntSample = vecDim;
+ pOut = out;
+
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut / accum;
+ pOut++;
+ blkCntSample--;
+ }
+
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of barycenter group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
new file mode 100644
index 0000000..e9612b1
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
@@ -0,0 +1,1039 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bitonic_sort_f32.c
+ * Description: Floating point bitonic sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+
+#if !defined(ARM_MATH_NEON)
+
+static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
+{
+ uint32_t step;
+ uint32_t k, j;
+ float32_t *leftPtr, *rightPtr;
+ float32_t temp;
+
+ step = n>>1;
+ leftPtr = pSrc;
+ rightPtr = pSrc+n-1;
+
+ for(k=0; k<step; k++)
+ {
+ if(dir == (*leftPtr > *rightPtr))
+ {
+ // Swap
+ temp=*leftPtr;
+ *leftPtr=*rightPtr;
+ *rightPtr=temp;
+ }
+
+ leftPtr++; // Move right
+ rightPtr--; // Move left
+ }
+
+ // Merge
+ for(step=(n>>2); step>0; step/=2)
+ {
+ for(j=0; j<n; j=j+step*2)
+ {
+ leftPtr = pSrc+j;
+ rightPtr = pSrc+j+step;
+
+ for(k=0; k<step; k++)
+ {
+ if(*leftPtr > *rightPtr)
+ {
+ // Swap
+ temp=*leftPtr;
+ *leftPtr=*rightPtr;
+ *rightPtr=temp;
+ }
+
+ leftPtr++;
+ rightPtr++;
+ }
+ }
+ }
+}
+#endif
+
+#if defined(ARM_MATH_NEON)
+
+
+static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+ /* Start with two vectors:
+ * +---+---+---+---+
+ * | a | b | c | d |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | e | f | g | h |
+ * +---+---+---+---+
+ * All the elements of the first are guaranteed to be less than or equal to
+ * all of the elements in the second, and both vectors are bitonic.
+ * We need to perform these operations to completely sort both lists:
+ * vminmax([abcd],[efgh])
+ * vminmax([acbd],[egfh])
+ */
+ vtrn128_64q(a, b);
+ /* +---+---+---+---+
+ * | a | b | e | f |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | c | d | g | h |
+ * +---+---+---+---+
+ */
+ if(dir)
+ vminmaxq(a, b);
+ else
+ vminmaxq(b, a);
+
+ vtrn128_32q(a, b);
+ /* +---+---+---+---+
+ * | a | c | e | g |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | b | d | f | h |
+ * +---+---+---+---+
+ */
+ if(dir)
+ vminmaxq(a, b);
+ else
+ vminmaxq(b, a);
+
+ return vzipq_f32(a, b);
+}
+
+
+static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+ /* a and b are guaranteed to be bitonic */
+ // Reverse the element of the second vector
+ b = vrev128q_f32(b);
+
+ // Compare the two vectors
+ if(dir)
+ vminmaxq(a, b);
+ else
+ vminmaxq(b, a);
+
+ // Merge the two vectors
+ float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);
+
+ return ab;
+}
+
+static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
+{
+ /* Start with two vectors:
+ * +---+---+---+---+---+---+---+---+
+ * | a | b | c | d | e | f | g | h |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | i | j | k | l | m | n | o | p |
+ * +---+---+---+---+---+---+---+---+
+ * All the elements of the first are guaranteed to be less than or equal to
+ * all of the elements in the second, and both vectors are bitonic.
+ * We need to perform these operations to completely sort both lists:
+ * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
+ * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
+ * vminmax([acef],[bdfh]) vminmax([ikmo],[jlmp])
+ */
+
+ vtrn256_128q(a, b);
+ /* +---+---+---+---+---+---+---+---+
+ * | a | b | c | d | i | j | k | l |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | e | f | g | h | m | n | o | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ vtrn256_64q(a, b);
+
+ /* +---+---+---+---+---+---+---+---+
+ * | a | b | e | f | i | j | m | n |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | c | d | g | h | k | l | o | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ vtrn256_32q(a, b);
+ /* We now have:
+ * +---+---+---+---+---+---+---+---+
+ * | a | c | e | g | i | k | m | o |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | b | d | f | h | j | l | n | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
+ float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);
+
+ vst1q_f32(pOut, out1.val[0]);
+ vst1q_f32(pOut+4, out1.val[1]);
+ vst1q_f32(pOut+8, out2.val[0]);
+ vst1q_f32(pOut+12, out2.val[1]);
+}
+
+static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
+{
+ // Merge two preordered float32x4x2_t
+ vrev256q_f32(b);
+
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ arm_bitonic_resort_16_f32(pOut, a, b, dir);
+}
+
+static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
+{
+ float32x4_t a;
+ float32x4_t b;
+ float32x4_t c;
+ float32x4_t d;
+
+ // Load 16 samples
+ a = vld1q_f32(pSrc);
+ b = vld1q_f32(pSrc+4);
+ c = vld1q_f32(pSrc+8);
+ d = vld1q_f32(pSrc+12);
+
+ // Bitonic sorting network for 4 samples x 4 times
+ if(dir)
+ {
+ vminmaxq(a, b);
+ vminmaxq(c, d);
+
+ vminmaxq(a, d);
+ vminmaxq(b, c);
+
+ vminmaxq(a, b);
+ vminmaxq(c, d);
+ }
+ else
+ {
+ vminmaxq(b, a);
+ vminmaxq(d, c);
+
+ vminmaxq(d, a);
+ vminmaxq(c, b);
+
+ vminmaxq(b, a);
+ vminmaxq(d, c);
+ }
+
+ float32x4x2_t ab = vtrnq_f32 (a, b);
+ float32x4x2_t cd = vtrnq_f32 (c, d);
+
+ // Transpose 4 ordered arrays of 4 samples
+ a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0]));
+ b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1]));
+ c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
+ d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));
+
+ // Merge pairs of arrays of 4 samples
+ ab = arm_bitonic_merge_8_f32(a, b, dir);
+ cd = arm_bitonic_merge_8_f32(c, d, dir);
+
+ // Merge arrays of 8 samples
+ arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
+}
+
+
+
+
+
+static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
+{
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ }
+ //Transpose 256
+ float32x4_t temp;
+
+ temp = ab2.val[0];
+ ab2.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab2.val[1];
+ ab2.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ }
+
+ //Transpose 128
+ arm_bitonic_merge_16_f32(pSrc+0, ab1, cd1, dir);
+ arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
+}
+
+static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4;
+ float32x4x2_t cd1, cd2, cd3, cd4;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12);
+ ab3.val[0] = vld1q_f32(pSrc+16);
+ ab3.val[1] = vld1q_f32(pSrc+20);
+ ab4.val[0] = vld1q_f32(pSrc+24);
+ ab4.val[1] = vld1q_f32(pSrc+28);
+
+ vldrev128q_f32(cd4.val[1], pSrc+32);
+ vldrev128q_f32(cd4.val[0], pSrc+36);
+ vldrev128q_f32(cd3.val[1], pSrc+40);
+ vldrev128q_f32(cd3.val[0], pSrc+44);
+ vldrev128q_f32(cd2.val[1], pSrc+48);
+ vldrev128q_f32(cd2.val[0], pSrc+52);
+ vldrev128q_f32(cd1.val[1], pSrc+56);
+ vldrev128q_f32(cd1.val[0], pSrc+60);
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ }
+
+ //Transpose 512
+ float32x4_t temp;
+
+ temp = ab3.val[0];
+ ab3.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab3.val[1];
+ ab3.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab4.val[0];
+ ab4.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab4.val[1];
+ ab4.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ }
+
+ //Transpose 256
+ arm_bitonic_merge_32_f32(pSrc+0, ab1, ab2, cd1, cd2, dir);
+ arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
+}
+
+static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
+ float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12);
+ ab3.val[0] = vld1q_f32(pSrc+16);
+ ab3.val[1] = vld1q_f32(pSrc+20);
+ ab4.val[0] = vld1q_f32(pSrc+24);
+ ab4.val[1] = vld1q_f32(pSrc+28);
+ ab5.val[0] = vld1q_f32(pSrc+32);
+ ab5.val[1] = vld1q_f32(pSrc+36);
+ ab6.val[0] = vld1q_f32(pSrc+40);
+ ab6.val[1] = vld1q_f32(pSrc+44);
+ ab7.val[0] = vld1q_f32(pSrc+48);
+ ab7.val[1] = vld1q_f32(pSrc+52);
+ ab8.val[0] = vld1q_f32(pSrc+56);
+ ab8.val[1] = vld1q_f32(pSrc+60);
+
+ vldrev128q_f32(cd8.val[1], pSrc+64);
+ vldrev128q_f32(cd8.val[0], pSrc+68);
+ vldrev128q_f32(cd7.val[1], pSrc+72);
+ vldrev128q_f32(cd7.val[0], pSrc+76);
+ vldrev128q_f32(cd6.val[1], pSrc+80);
+ vldrev128q_f32(cd6.val[0], pSrc+84);
+ vldrev128q_f32(cd5.val[1], pSrc+88);
+ vldrev128q_f32(cd5.val[0], pSrc+92);
+ vldrev128q_f32(cd4.val[1], pSrc+96);
+ vldrev128q_f32(cd4.val[0], pSrc+100);
+ vldrev128q_f32(cd3.val[1], pSrc+104);
+ vldrev128q_f32(cd3.val[0], pSrc+108);
+ vldrev128q_f32(cd2.val[1], pSrc+112);
+ vldrev128q_f32(cd2.val[0], pSrc+116);
+ vldrev128q_f32(cd1.val[1], pSrc+120);
+ vldrev128q_f32(cd1.val[0], pSrc+124);
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ vminmax256q(ab5, cd5);
+ vminmax256q(ab6, cd6);
+ vminmax256q(ab7, cd7);
+ vminmax256q(ab8, cd8);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ vminmax256q(cd5, ab5);
+ vminmax256q(cd6, ab6);
+ vminmax256q(cd7, ab7);
+ vminmax256q(cd8, ab8);
+ }
+
+ //Transpose
+ float32x4_t temp;
+
+ temp = ab5.val[0];
+ ab5.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab5.val[1];
+ ab5.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab6.val[0];
+ ab6.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab6.val[1];
+ ab6.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+ temp = ab7.val[0];
+ ab7.val[0] = cd3.val[0];
+ cd3.val[0] = temp;
+ temp = ab7.val[1];
+ ab7.val[1] = cd3.val[1];
+ cd3.val[1] = temp;
+ temp = ab8.val[0];
+ ab8.val[0] = cd4.val[0];
+ cd4.val[0] = temp;
+ temp = ab8.val[1];
+ ab8.val[1] = cd4.val[1];
+ cd4.val[1] = temp;
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ vminmax256q(ab5, cd5);
+ vminmax256q(ab6, cd6);
+ vminmax256q(ab7, cd7);
+ vminmax256q(ab8, cd8);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ vminmax256q(cd5, ab5);
+ vminmax256q(cd6, ab6);
+ vminmax256q(cd7, ab7);
+ vminmax256q(cd8, ab8);
+ }
+
+ vst1q_f32(pSrc, ab1.val[0]);
+ vst1q_f32(pSrc+4, ab1.val[1]);
+ vst1q_f32(pSrc+8, ab2.val[0]);
+ vst1q_f32(pSrc+12, ab2.val[1]);
+ vst1q_f32(pSrc+16, ab3.val[0]);
+ vst1q_f32(pSrc+20, ab3.val[1]);
+ vst1q_f32(pSrc+24, ab4.val[0]);
+ vst1q_f32(pSrc+28, ab4.val[1]);
+ vst1q_f32(pSrc+32, cd1.val[0]);
+ vst1q_f32(pSrc+36, cd1.val[1]);
+ vst1q_f32(pSrc+40, cd2.val[0]);
+ vst1q_f32(pSrc+44, cd2.val[1]);
+ vst1q_f32(pSrc+48, cd3.val[0]);
+ vst1q_f32(pSrc+52, cd3.val[1]);
+ vst1q_f32(pSrc+56, cd4.val[0]);
+ vst1q_f32(pSrc+60, cd4.val[1]);
+ vst1q_f32(pSrc+64, ab5.val[0]);
+ vst1q_f32(pSrc+68, ab5.val[1]);
+ vst1q_f32(pSrc+72, ab6.val[0]);
+ vst1q_f32(pSrc+76, ab6.val[1]);
+ vst1q_f32(pSrc+80, ab7.val[0]);
+ vst1q_f32(pSrc+84, ab7.val[1]);
+ vst1q_f32(pSrc+88, ab8.val[0]);
+ vst1q_f32(pSrc+92, ab8.val[1]);
+ vst1q_f32(pSrc+96, cd5.val[0]);
+ vst1q_f32(pSrc+100, cd5.val[1]);
+ vst1q_f32(pSrc+104, cd6.val[0]);
+ vst1q_f32(pSrc+108, cd6.val[1]);
+ vst1q_f32(pSrc+112, cd7.val[0]);
+ vst1q_f32(pSrc+116, cd7.val[1]);
+ vst1q_f32(pSrc+120, cd8.val[0]);
+ vst1q_f32(pSrc+124, cd8.val[1]);
+
+ //Transpose
+ arm_bitonic_merge_64_f32(pSrc+0 , dir);
+ arm_bitonic_merge_64_f32(pSrc+64, dir);
+}
+
+static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
+ float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
+ float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+ float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12 );
+ ab3.val[0] = vld1q_f32(pSrc+16 );
+ ab3.val[1] = vld1q_f32(pSrc+20 );
+ ab4.val[0] = vld1q_f32(pSrc+24 );
+ ab4.val[1] = vld1q_f32(pSrc+28 );
+ ab5.val[0] = vld1q_f32(pSrc+32 );
+ ab5.val[1] = vld1q_f32(pSrc+36 );
+ ab6.val[0] = vld1q_f32(pSrc+40 );
+ ab6.val[1] = vld1q_f32(pSrc+44 );
+ ab7.val[0] = vld1q_f32(pSrc+48 );
+ ab7.val[1] = vld1q_f32(pSrc+52 );
+ ab8.val[0] = vld1q_f32(pSrc+56 );
+ ab8.val[1] = vld1q_f32(pSrc+60 );
+ ab9.val[0] = vld1q_f32(pSrc+64 );
+ ab9.val[1] = vld1q_f32(pSrc+68 );
+ ab10.val[0] = vld1q_f32(pSrc+72 );
+ ab10.val[1] = vld1q_f32(pSrc+76 );
+ ab11.val[0] = vld1q_f32(pSrc+80 );
+ ab11.val[1] = vld1q_f32(pSrc+84 );
+ ab12.val[0] = vld1q_f32(pSrc+88 );
+ ab12.val[1] = vld1q_f32(pSrc+92 );
+ ab13.val[0] = vld1q_f32(pSrc+96 );
+ ab13.val[1] = vld1q_f32(pSrc+100);
+ ab14.val[0] = vld1q_f32(pSrc+104);
+ ab14.val[1] = vld1q_f32(pSrc+108);
+ ab15.val[0] = vld1q_f32(pSrc+112);
+ ab15.val[1] = vld1q_f32(pSrc+116);
+ ab16.val[0] = vld1q_f32(pSrc+120);
+ ab16.val[1] = vld1q_f32(pSrc+124);
+
+ vldrev128q_f32(cd16.val[1], pSrc+128);
+ vldrev128q_f32(cd16.val[0], pSrc+132);
+ vldrev128q_f32(cd15.val[1], pSrc+136);
+ vldrev128q_f32(cd15.val[0], pSrc+140);
+ vldrev128q_f32(cd14.val[1], pSrc+144);
+ vldrev128q_f32(cd14.val[0], pSrc+148);
+ vldrev128q_f32(cd13.val[1], pSrc+152);
+ vldrev128q_f32(cd13.val[0], pSrc+156);
+ vldrev128q_f32(cd12.val[1], pSrc+160);
+ vldrev128q_f32(cd12.val[0], pSrc+164);
+ vldrev128q_f32(cd11.val[1], pSrc+168);
+ vldrev128q_f32(cd11.val[0], pSrc+172);
+ vldrev128q_f32(cd10.val[1], pSrc+176);
+ vldrev128q_f32(cd10.val[0], pSrc+180);
+ vldrev128q_f32(cd9.val[1] , pSrc+184);
+ vldrev128q_f32(cd9.val[0] , pSrc+188);
+ vldrev128q_f32(cd8.val[1] , pSrc+192);
+ vldrev128q_f32(cd8.val[0] , pSrc+196);
+ vldrev128q_f32(cd7.val[1] , pSrc+200);
+ vldrev128q_f32(cd7.val[0] , pSrc+204);
+ vldrev128q_f32(cd6.val[1] , pSrc+208);
+ vldrev128q_f32(cd6.val[0] , pSrc+212);
+ vldrev128q_f32(cd5.val[1] , pSrc+216);
+ vldrev128q_f32(cd5.val[0] , pSrc+220);
+ vldrev128q_f32(cd4.val[1] , pSrc+224);
+ vldrev128q_f32(cd4.val[0] , pSrc+228);
+ vldrev128q_f32(cd3.val[1] , pSrc+232);
+ vldrev128q_f32(cd3.val[0] , pSrc+236);
+ vldrev128q_f32(cd2.val[1] , pSrc+240);
+ vldrev128q_f32(cd2.val[0] , pSrc+244);
+ vldrev128q_f32(cd1.val[1] , pSrc+248);
+ vldrev128q_f32(cd1.val[0] , pSrc+252);
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1 , cd1 );
+ vminmax256q(ab2 , cd2 );
+ vminmax256q(ab3 , cd3 );
+ vminmax256q(ab4 , cd4 );
+ vminmax256q(ab5 , cd5 );
+ vminmax256q(ab6 , cd6 );
+ vminmax256q(ab7 , cd7 );
+ vminmax256q(ab8 , cd8 );
+ vminmax256q(ab9 , cd9 );
+ vminmax256q(ab10, cd10);
+ vminmax256q(ab11, cd11);
+ vminmax256q(ab12, cd12);
+ vminmax256q(ab13, cd13);
+ vminmax256q(ab14, cd14);
+ vminmax256q(ab15, cd15);
+ vminmax256q(ab16, cd16);
+ }
+ else
+ {
+ vminmax256q(cd1 , ab1 );
+ vminmax256q(cd2 , ab2 );
+ vminmax256q(cd3 , ab3 );
+ vminmax256q(cd4 , ab4 );
+ vminmax256q(cd5 , ab5 );
+ vminmax256q(cd6 , ab6 );
+ vminmax256q(cd7 , ab7 );
+ vminmax256q(cd8 , ab8 );
+ vminmax256q(cd9 , ab9 );
+ vminmax256q(cd10, ab10);
+ vminmax256q(cd11, ab11);
+ vminmax256q(cd12, ab12);
+ vminmax256q(cd13, ab13);
+ vminmax256q(cd14, ab14);
+ vminmax256q(cd15, ab15);
+ vminmax256q(cd16, ab16);
+ }
+
+ //Transpose
+ float32x4_t temp;
+
+ temp = ab9.val[0];
+ ab9.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab9.val[1];
+ ab9.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab10.val[0];
+ ab10.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab10.val[1];
+ ab10.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+ temp = ab11.val[0];
+ ab11.val[0] = cd3.val[0];
+ cd3.val[0] = temp;
+ temp = ab11.val[1];
+ ab11.val[1] = cd3.val[1];
+ cd3.val[1] = temp;
+ temp = ab12.val[0];
+ ab12.val[0] = cd4.val[0];
+ cd4.val[0] = temp;
+ temp = ab12.val[1];
+ ab12.val[1] = cd4.val[1];
+ cd4.val[1] = temp;
+ temp = ab13.val[0];
+ ab13.val[0] = cd5.val[0];
+ cd5.val[0] = temp;
+ temp = ab13.val[1];
+ ab13.val[1] = cd5.val[1];
+ cd5.val[1] = temp;
+ temp = ab14.val[0];
+ ab14.val[0] = cd6.val[0];
+ cd6.val[0] = temp;
+ temp = ab14.val[1];
+ ab14.val[1] = cd6.val[1];
+ cd6.val[1] = temp;
+ temp = ab15.val[0];
+ ab15.val[0] = cd7.val[0];
+ cd7.val[0] = temp;
+ temp = ab15.val[1];
+ ab15.val[1] = cd7.val[1];
+ cd7.val[1] = temp;
+ temp = ab16.val[0];
+ ab16.val[0] = cd8.val[0];
+ cd8.val[0] = temp;
+ temp = ab16.val[1];
+ ab16.val[1] = cd8.val[1];
+ cd8.val[1] = temp;
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1 , cd1 );
+ vminmax256q(ab2 , cd2 );
+ vminmax256q(ab3 , cd3 );
+ vminmax256q(ab4 , cd4 );
+ vminmax256q(ab5 , cd5 );
+ vminmax256q(ab6 , cd6 );
+ vminmax256q(ab7 , cd7 );
+ vminmax256q(ab8 , cd8 );
+ vminmax256q(ab9 , cd9 );
+ vminmax256q(ab10, cd10);
+ vminmax256q(ab11, cd11);
+ vminmax256q(ab12, cd12);
+ vminmax256q(ab13, cd13);
+ vminmax256q(ab14, cd14);
+ vminmax256q(ab15, cd15);
+ vminmax256q(ab16, cd16);
+ }
+ else
+ {
+ vminmax256q(cd1 , ab1 );
+ vminmax256q(cd2 , ab2 );
+ vminmax256q(cd3 , ab3 );
+ vminmax256q(cd4 , ab4 );
+ vminmax256q(cd5 , ab5 );
+ vminmax256q(cd6 , ab6 );
+ vminmax256q(cd7 , ab7 );
+ vminmax256q(cd8 , ab8 );
+ vminmax256q(cd9 , ab9 );
+ vminmax256q(cd10, ab10);
+ vminmax256q(cd11, ab11);
+ vminmax256q(cd12, ab12);
+ vminmax256q(cd13, ab13);
+ vminmax256q(cd14, ab14);
+ vminmax256q(cd15, ab15);
+ vminmax256q(cd16, ab16);
+ }
+
+ vst1q_f32(pSrc, ab1.val[0] );
+ vst1q_f32(pSrc+4, ab1.val[1] );
+ vst1q_f32(pSrc+8, ab2.val[0] );
+ vst1q_f32(pSrc+12, ab2.val[1] );
+ vst1q_f32(pSrc+16, ab3.val[0] );
+ vst1q_f32(pSrc+20, ab3.val[1] );
+ vst1q_f32(pSrc+24, ab4.val[0] );
+ vst1q_f32(pSrc+28, ab4.val[1] );
+ vst1q_f32(pSrc+32, ab5.val[0] );
+ vst1q_f32(pSrc+36, ab5.val[1] );
+ vst1q_f32(pSrc+40, ab6.val[0] );
+ vst1q_f32(pSrc+44, ab6.val[1] );
+ vst1q_f32(pSrc+48, ab7.val[0] );
+ vst1q_f32(pSrc+52, ab7.val[1] );
+ vst1q_f32(pSrc+56, ab8.val[0] );
+ vst1q_f32(pSrc+60, ab8.val[1] );
+ vst1q_f32(pSrc+64, cd1.val[0] );
+ vst1q_f32(pSrc+68, cd1.val[1] );
+ vst1q_f32(pSrc+72, cd2.val[0] );
+ vst1q_f32(pSrc+76, cd2.val[1] );
+ vst1q_f32(pSrc+80, cd3.val[0] );
+ vst1q_f32(pSrc+84, cd3.val[1] );
+ vst1q_f32(pSrc+88, cd4.val[0] );
+ vst1q_f32(pSrc+92, cd4.val[1] );
+ vst1q_f32(pSrc+96, cd5.val[0] );
+ vst1q_f32(pSrc+100, cd5.val[1] );
+ vst1q_f32(pSrc+104, cd6.val[0] );
+ vst1q_f32(pSrc+108, cd6.val[1] );
+ vst1q_f32(pSrc+112, cd7.val[0] );
+ vst1q_f32(pSrc+116, cd7.val[1] );
+ vst1q_f32(pSrc+120, cd8.val[0] );
+ vst1q_f32(pSrc+124, cd8.val[1] );
+ vst1q_f32(pSrc+128, ab9.val[0] );
+ vst1q_f32(pSrc+132, ab9.val[1] );
+ vst1q_f32(pSrc+136, ab10.val[0]);
+ vst1q_f32(pSrc+140, ab10.val[1]);
+ vst1q_f32(pSrc+144, ab11.val[0]);
+ vst1q_f32(pSrc+148, ab11.val[1]);
+ vst1q_f32(pSrc+152, ab12.val[0]);
+ vst1q_f32(pSrc+156, ab12.val[1]);
+ vst1q_f32(pSrc+160, ab13.val[0]);
+ vst1q_f32(pSrc+164, ab13.val[1]);
+ vst1q_f32(pSrc+168, ab14.val[0]);
+ vst1q_f32(pSrc+172, ab14.val[1]);
+ vst1q_f32(pSrc+176, ab15.val[0]);
+ vst1q_f32(pSrc+180, ab15.val[1]);
+ vst1q_f32(pSrc+184, ab16.val[0]);
+ vst1q_f32(pSrc+188, ab16.val[1]);
+ vst1q_f32(pSrc+192, cd9.val[0] );
+ vst1q_f32(pSrc+196, cd9.val[1] );
+ vst1q_f32(pSrc+200, cd10.val[0]);
+ vst1q_f32(pSrc+204, cd10.val[1]);
+ vst1q_f32(pSrc+208, cd11.val[0]);
+ vst1q_f32(pSrc+212, cd11.val[1]);
+ vst1q_f32(pSrc+216, cd12.val[0]);
+ vst1q_f32(pSrc+220, cd12.val[1]);
+ vst1q_f32(pSrc+224, cd13.val[0]);
+ vst1q_f32(pSrc+228, cd13.val[1]);
+ vst1q_f32(pSrc+232, cd14.val[0]);
+ vst1q_f32(pSrc+236, cd14.val[1]);
+ vst1q_f32(pSrc+240, cd15.val[0]);
+ vst1q_f32(pSrc+244, cd15.val[1]);
+ vst1q_f32(pSrc+248, cd16.val[0]);
+ vst1q_f32(pSrc+252, cd16.val[1]);
+
+ //Transpose
+ arm_bitonic_merge_128_f32(pSrc+0 , dir);
+ arm_bitonic_merge_128_f32(pSrc+128, dir);
+}
+
+#define SWAP(a,i,j) \
+ temp = vgetq_lane_f32(a, j); \
+ a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\
+ a = vsetq_lane_f32(temp, a, i);
+
+static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
+{
+ float32_t temp;
+
+
+ if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
+ {
+ SWAP(a,0,1);
+ }
+ if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
+ {
+ SWAP(a,2,3);
+ }
+
+ if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
+ {
+ SWAP(a,0,3);
+ }
+ if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
+ {
+ SWAP(a,1,2);
+ }
+
+ if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
+ {
+ SWAP(a,0,1);
+ }
+ if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) )
+ {
+ SWAP(a,2,3);
+ }
+
+ return a;
+}
+
+static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+ a = arm_bitonic_sort_4_f32(a, dir);
+ b = arm_bitonic_sort_4_f32(b, dir);
+ return arm_bitonic_merge_8_f32(a, b, dir);
+}
+
+
+
+#endif
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup Sorting Vector sorting algorithms
+
+ Sort the elements of a vector
+
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ */
+void arm_bitonic_sort_f32(
+const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint16_t s, i;
+ uint8_t dir = S->dir;
+
+#ifdef ARM_MATH_NEON
+ (void)s;
+
+ float32_t * pOut;
+ uint16_t counter = blockSize>>5;
+
+ if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
+ {
+ if(pSrc == pDst) // in-place
+ pOut = pSrc;
+ else
+ pOut = pDst;
+
+ float32x4x2_t ab1, ab2;
+ float32x4x2_t cd1, cd2;
+
+ if(blockSize == 1)
+ pOut = pSrc;
+ else if(blockSize == 2)
+ {
+ float32_t temp;
+
+ if( dir==(pSrc[0]>pSrc[1]) )
+ {
+ temp = pSrc[1];
+ pOut[1] = pSrc[0];
+ pOut[0] = temp;
+ }
+ else
+ pOut = pSrc;
+ }
+ else if(blockSize == 4)
+ {
+ float32x4_t a = vld1q_f32(pSrc);
+
+ a = arm_bitonic_sort_4_f32(a, dir);
+
+ vst1q_f32(pOut, a);
+ }
+ else if(blockSize == 8)
+ {
+ float32x4_t a;
+ float32x4_t b;
+ float32x4x2_t ab;
+
+ a = vld1q_f32(pSrc);
+ b = vld1q_f32(pSrc+4);
+
+ ab = arm_bitonic_sort_8_f32(a, b, dir);
+
+ vst1q_f32(pOut, ab.val[0]);
+ vst1q_f32(pOut+4, ab.val[1]);
+ }
+ else if(blockSize >=16)
+ {
+ // Order 16 bits long vectors
+ for(i=0; i<blockSize; i=i+16)
+ arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);
+
+ // Merge
+ for(i=0; i<counter; i++)
+ {
+ // Load and reverse second vector
+ ab1.val[0] = vld1q_f32(pOut+32*i+0 );
+ ab1.val[1] = vld1q_f32(pOut+32*i+4 );
+ ab2.val[0] = vld1q_f32(pOut+32*i+8 );
+ ab2.val[1] = vld1q_f32(pOut+32*i+12);
+
+ vldrev128q_f32(cd2.val[1], pOut+32*i+16);
+ vldrev128q_f32(cd2.val[0], pOut+32*i+20);
+ vldrev128q_f32(cd1.val[1], pOut+32*i+24);
+ vldrev128q_f32(cd1.val[0], pOut+32*i+28);
+
+ arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir);
+ }
+
+ counter = counter>>1;
+ for(i=0; i<counter; i++)
+ arm_bitonic_merge_64_f32(pOut+64*i, dir);
+
+ counter = counter>>1;
+ for(i=0; i<counter; i++)
+ arm_bitonic_merge_128_f32(pOut+128*i, dir);
+
+ counter = counter>>1;
+ for(i=0; i<counter; i++)
+ arm_bitonic_merge_256_f32(pOut+256*i, dir);
+
+ // Etc...
+ }
+ }
+
+#else
+
+ float32_t * pA;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+
+ if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
+ {
+ for(s=2; s<=blockSize; s=s*2)
+ {
+ for(i=0; i<blockSize; i=i+s)
+ arm_bitonic_sort_core_f32(pA+i, s, dir);
+ }
+ }
+#endif
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
new file mode 100644
index 0000000..640778d
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bubble_sort_f32.c
+ * Description: Floating point bubble sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The bubble sort algorithm is a simple comparison algorithm that
+ * reads the elements of a vector from the beginning to the end,
+ * compares the adjacent ones and swaps them if they are in the
+ * wrong order. The procedure is repeated until there is nothing
+ * left to swap. Bubble sort is fast for input vectors that are
+ * nearly sorted.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed
+ */
+
+void arm_bubble_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint8_t dir = S->dir;
+ uint32_t i;
+ uint8_t swapped =1;
+ float32_t * pA;
+ float32_t temp;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ while(swapped==1) // If nothing has been swapped after one loop stop
+ {
+ swapped=0;
+
+ for(i=0; i<blockSize-1; i++)
+ {
+ if(dir==(pA[i]>pA[i+1]))
+ {
+ // Swap
+ temp = pA[i];
+ pA[i] = pA[i+1];
+ pA[i+1] = temp;
+
+ // Update flag
+ swapped = 1;
+ }
+ }
+
+ blockSize--;
+ }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c
new file mode 100644
index 0000000..d441332
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_f16.c
+ * Description: Copies the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a f16 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_copy_f16(
+ const float16_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ do {
+ mve_pred16_t p = vctp16q(blockSize);
+
+ vstrhq_p_f16(pDst,
+ vldrhq_z_f16((float16_t const *) pSrc, p), p);
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 8;
+ pDst += 8;
+ blockSize -= 8;
+ }
+ while ((int32_t) blockSize > 0);
+}
+
+#else
+
+void arm_copy_f16(
+ const float16_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of BasicCopy group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
new file mode 100644
index 0000000..d739f7c
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
@@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_f32.c
+ * Description: Copies the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup copy Vector Copy
+
+ Copies sample by sample from source vector to destination vector.
+
+ <pre>
+ pDst[n] = pSrc[n]; 0 <= n < blockSize.
+ </pre>
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a floating-point vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_copy_f32(
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time */
+ while (blkCnt > 0U)
+ {
+ vstrwq_f32(pDst, vldrwq_f32(pSrc));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 4;
+ pDst += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_copy_f32(
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t inV;
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+ /* Copy and then store the results in the destination buffer */
+ inV = vld1q_f32(pSrc);
+ vst1q_f32(pDst, inV);
+ pSrc += 4;
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+ /* Copy and then store the results in the destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_copy_f32(
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c
new file mode 100644
index 0000000..ec1df54
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c
@@ -0,0 +1,71 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_f64.c
+ * Description: Copies the elements of a floating-point vector
+ *
+ * $Date: 13 September 2021
+ * $Revision: V1.10.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a floating-point vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+void arm_copy_f64(
+ const float64_t * pSrc,
+ float64_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
new file mode 100644
index 0000000..18f3387
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_q15.c
+ * Description: Copies the elements of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a Q15 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q15(
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ vstrhq_s16(pDst,vldrhq_s16(pSrc));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 8;
+ pDst += 8;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_copy_q15(
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* read 2 times 2 samples at a time */
+ write_q15x2_ia (&pDst, read_q15x2_ia (&pSrc));
+ write_q15x2_ia (&pDst, read_q15x2_ia (&pSrc));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
new file mode 100644
index 0000000..8e06bda
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
@@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_q31.c
+ * Description: Copies the elements of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a Q31 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q31(
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time */
+ while (blkCnt > 0U)
+ {
+ vstrwq_s32(pDst,vldrwq_s32(pSrc));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 4;
+ pDst += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+#else
+void arm_copy_q31(
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
new file mode 100644
index 0000000..1918d3e
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
@@ -0,0 +1,132 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_q7.c
+ * Description: Copies the elements of a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a Q7 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q7(
+ const q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+
+ uint32_t blkCnt;
+
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+
+ vstrbq_s8(pDst,vldrbq_s8(pSrc));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 16;
+ pDst += 16;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_copy_q7(
+ const q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* read 4 samples at a time */
+ write_q7x4_ia (&pDst, read_q7x4_ia (&pSrc));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c
new file mode 100644
index 0000000..a004353
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q15.c
+ * Description: Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup f16_to_x Convert 16-bit floating point value
+ */
+
+/**
+ @addtogroup f16_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the f16 vector to f32 vector.
+ @param[in] pSrc points to the f16 input vector
+ @param[out] pDst points to the f32 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_f16_to_float built. Helium version has build issues with gcc."
+#endif
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+
+void arm_f16_to_float(
+ const float16_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ int32_t blkCnt; /* loop counters */
+ float16x8_t vecDst;
+ float32x4x2_t tmp;
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ vecDst = vldrhq_f16(pSrc);
+ pSrc += 8;
+
+ tmp.val[0] = vcvtbq_f32_f16(vecDst);
+ tmp.val[1] = vcvttq_f32_f16(vecDst);
+ vst2q(pDst,tmp);
+
+ pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0)
+ {
+
+ *pDst++ = (float32_t) *pSrc++;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_f16_to_float(
+ const float16_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const float16_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ /*
+ * Loop over blockSize number of values
+ */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+
+ *pDst++ = (float32_t) * pIn++;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of f16_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c
new file mode 100644
index 0000000..bb425d1
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c
@@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q15.c
+ * Description: Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup f16_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the f16 vector to Q15 vector.
+ @param[in] pSrc points to the f16 input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q15_t)(pSrc[n] * 32768); 0 <= n < blockSize.
+ </pre>
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+
+ @note
+ In order to apply rounding in scalar version, the library should be rebuilt with the ROUNDING macro
+ defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_f16_to_q15(
+ const float16_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ float16_t maxQ = (float16_t) Q15_MAX;
+ float16x8_t vecDst;
+
+
+ do {
+ mve_pred16_t p = vctp16q(blockSize);
+
+ vecDst = vldrhq_z_f16((float16_t const *) pSrc, p);
+ /* C = A * 32767 */
+ /* convert from float to Q15 and then store the results in the destination buffer */
+ vecDst = vmulq_m(vuninitializedq_f16(), vecDst, maxQ, p);
+
+ vstrhq_p_s16(pDst,
+ vcvtaq_m(vuninitializedq_s16(), vecDst, p), p);
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 8;
+ pDst += 8;
+ blockSize -= 8;
+ }
+ while ((int32_t) blockSize > 0);
+}
+
+#else
+
+void arm_f16_to_q15(
+ const float16_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ const float16_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+#ifdef ARM_MATH_ROUNDING
+ float16_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /*
+ * Loop over blockSize number of values
+ */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /*
+ * C = A * 65536
+ */
+ /*
+ * convert from float to Q31 and then store the results in the destination buffer
+ */
+ in = *pIn++;
+ in = (in * 32768.0);
+ in += in > 0.0 ? 0.5 : -0.5;
+ *pDst++ = clip_q31_to_q15((q31_t) (in));
+
+#else
+
+ /*
+ * C = A * 32768
+ */
+ /*
+ * convert from float to Q31 and then store the results in the destination buffer
+ */
+ *pDst++ = clip_q31_to_q15((q31_t) ((_Float16)*pIn++ * 32768.0f16));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of f16_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c
new file mode 100644
index 0000000..0b08f12
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c
@@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_f16.c
+ * Description: Fills a constant value into a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a f16 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_fill_f16(
+ float16_t value,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ do {
+ mve_pred16_t p = vctp16q(blockSize);
+
+ vstrhq_p_f16(pDst,
+ vdupq_m_n_f16(vuninitializedq_f16(), value, p), p);
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 8;
+ blockSize -= 8;
+ }
+ while ((int32_t) blockSize > 0);
+}
+#else
+void arm_fill_f16(
+ float16_t value,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Fill group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
new file mode 100644
index 0000000..50cdd8f
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
@@ -0,0 +1,189 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_f32.c
+ * Description: Fills a constant value into a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup Fill Vector Fill
+
+ Fills the destination vector with a constant value.
+
+ <pre>
+ pDst[n] = value; 0 <= n < blockSize.
+ </pre>
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a floating-point vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_f32(
+ float32_t value,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time */
+ while (blkCnt > 0U)
+ {
+
+ vstrwq_f32(pDst,vdupq_n_f32(value));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_fill_f32(
+ float32_t value,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+
+ float32x4_t inV = vdupq_n_f32(value);
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ vst1q_f32(pDst, inV);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *pDst++ = value;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_fill_f32(
+ float32_t value,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Fill group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c
new file mode 100644
index 0000000..4bc2700
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c
@@ -0,0 +1,71 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_f64.c
+ * Description: Fills a constant value into a floating-point vector
+ *
+ * $Date: 13 September 2021
+ * $Revision: V1.10.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a floating-point vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+void arm_fill_f64(
+ float64_t value,
+ float64_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ @} end of Fill group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
new file mode 100644
index 0000000..997a728
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_q15.c
+ * Description: Fills a constant value into a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a Q15 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_q15(
+ q15_t value,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+
+ vstrhq_s16(pDst,vdupq_n_s16(value));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 8;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_fill_q15(
+ q15_t value,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t packedValue; /* value packed to 32 bits */
+
+ /* Packing two 16 bit values to 32 bit value in order to use SIMD */
+ packedValue = __PKHBT(value, value, 16U);
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* fill 2 times 2 samples at a time */
+ write_q15x2_ia (&pDst, packedValue);
+ write_q15x2_ia (&pDst, packedValue);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Fill group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
new file mode 100644
index 0000000..7da5fb6
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
@@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_q31.c
+ * Description: Fills a constant value into a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a Q31 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_q31(
+ q31_t value,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time */
+ while (blkCnt > 0U)
+ {
+
+ vstrwq_s32(pDst,vdupq_n_s32(value));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+#else
+void arm_fill_q31(
+ q31_t value,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Fill group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
new file mode 100644
index 0000000..830fc73
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
@@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_q7.c
+ * Description: Fills a constant value into a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a Q7 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_q7(
+ q7_t value,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+
+ vstrbq_s8(pDst,vdupq_n_s8(value));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 16;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_fill_q7(
+ q7_t value,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t packedValue; /* value packed to 32 bits */
+
+ /* Packing four 8 bit values to 32 bit value in order to use SIMD */
+ packedValue = __PACKq7(value, value, value, value);
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* fill 4 samples at a time */
+ write_q7x4_ia (&pDst, packedValue);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Fill group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c
new file mode 100644
index 0000000..d627a8a
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c
@@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q15.c
+ * Description: Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup float_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the floating-point vector to f16 vector.
+ @param[in] pSrc points to the f32 input vector
+ @param[out] pDst points to the f16 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_float_to_f16 built. Helium version has build issues with gcc."
+#endif
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+
+void arm_float_to_f16(
+ const float32_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ int32_t blkCnt; /* loop counters */
+ float32x4x2_t tmp;
+ float16x8_t vecDst;
+ float32_t const *pSrcVec;
+
+
+ pSrcVec = (float32_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ /* convert from float32 to float16 and then store the results in the destination buffer */
+ tmp = vld2q(pSrcVec); pSrcVec += 8;
+ /* narrow / merge */
+ vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]);
+ vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]);
+ vst1q(pDst, vecDst); pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 7;
+ if (blkCnt > 0)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ tmp = vld2q(pSrcVec);
+ vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]);
+ vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]);
+ vstrhq_p(pDst, vecDst, p0);
+ }
+}
+
+#else
+
+void arm_float_to_f16(
+ const float32_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ /*
+ * Loop over blockSize number of values
+ */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+
+ *pDst++ = (float16_t) * pIn++;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of float_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
new file mode 100644
index 0000000..8061c9f
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
@@ -0,0 +1,308 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q15.c
+ * Description: Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup float_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the floating-point vector to Q15 vector.
+ @param[in] pSrc points to the floating-point input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q15_t)(pSrc[n] * 32768); 0 <= n < blockSize.
+ </pre>
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+
+ @note
+ In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_float_to_q15(
+ const float32_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ float32_t maxQ = (float32_t) Q15_MAX;
+ f32x4x2_t tmp;
+ q15x8_t vecDst;
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif
+
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C = A * 32768 */
+ /* convert from float to q15 and then store the results in the destination buffer */
+ tmp = vld2q(pSrc);
+
+ tmp.val[0] = vmulq(tmp.val[0], maxQ);
+ tmp.val[1] = vmulq(tmp.val[1], maxQ);
+
+ vecDst = vqmovnbq(vecDst, vcvtaq_s32_f32(tmp.val[0]));
+ vecDst = vqmovntq(vecDst, vcvtaq_s32_f32(tmp.val[1]));
+ vst1q(pDst, vecDst);
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ pDst += 8;
+ pSrc += 8;
+ }
+
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = A * 32768 */
+
+ /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pSrc++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pSrc++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_float_to_q15(
+ const float32_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t inV;
+ #ifdef ARM_MATH_ROUNDING
+ float32x4_t zeroV = vdupq_n_f32(0.0f);
+ float32x4_t pHalf = vdupq_n_f32(0.5f / 32768.0f);
+ float32x4_t mHalf = vdupq_n_f32(-0.5f / 32768.0f);
+ float32x4_t r;
+ uint32x4_t cmp;
+ float32_t in;
+ #endif
+
+ int32x4_t cvt;
+ int16x4_t outV;
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+ cmp = vcgtq_f32(inV,zeroV);
+ r = vbslq_f32(cmp,pHalf,mHalf);
+ inV = vaddq_f32(inV, r);
+
+ pIn += 4;
+
+ cvt = vcvtq_n_s32_f32(inV,15);
+ outV = vqmovn_s32(cvt);
+
+ vst1_s16(pDst, outV);
+ pDst += 4;
+
+#else
+
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+
+ cvt = vcvtq_n_s32_f32(inV,15);
+ outV = vqmovn_s32(cvt);
+
+ vst1_s16(pDst, outV);
+ pDst += 4;
+ pIn += 4;
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_float_to_q15(
+ const float32_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const float32_t *pIn = pSrc; /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 32768 */
+
+ /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 32768 */
+
+ /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of float_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
new file mode 100644
index 0000000..a222e49
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
@@ -0,0 +1,314 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q31.c
+ * Description: Converts the elements of the floating-point vector to Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup float_to_x Convert 32-bit floating point value
+ */
+
+/**
+ @addtogroup float_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the floating-point vector to Q31 vector.
+ @param[in] pSrc points to the floating-point input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q31_t)(pSrc[n] * 2147483648); 0 <= n < blockSize.
+ </pre>
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
+
+ @note
+ In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_float_to_q31(
+ const float32_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ float32_t maxQ = (float32_t) Q31_MAX;
+ f32x4_t vecDst;
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif
+
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time. */
+ while (blkCnt > 0U)
+ {
+
+ vecDst = vldrwq_f32(pSrc);
+ /* C = A * 2147483648 */
+ /* convert from float to Q31 and then store the results in the destination buffer */
+ vecDst = vmulq(vecDst, maxQ);
+
+ vstrwq_s32(pDst, vcvtaq_s32_f32(vecDst));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 4;
+ pDst += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 2147483648 */
+
+ /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pSrc++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pSrc++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_float_to_q31(
+ const float32_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t inV;
+ #ifdef ARM_MATH_ROUNDING
+ float32_t in;
+ float32x4_t zeroV = vdupq_n_f32(0.0f);
+ float32x4_t pHalf = vdupq_n_f32(0.5f / 2147483648.0f);
+ float32x4_t mHalf = vdupq_n_f32(-0.5f / 2147483648.0f);
+ float32x4_t r;
+ uint32x4_t cmp;
+ #endif
+
+ int32x4_t outV;
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /* C = A * 32768 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+ cmp = vcgtq_f32(inV,zeroV);
+ r = vbslq_f32(cmp,pHalf,mHalf);
+ inV = vaddq_f32(inV, r);
+
+ pIn += 4;
+
+ outV = vcvtq_n_s32_f32(inV,31);
+
+ vst1q_s32(pDst, outV);
+ pDst += 4;
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+
+ outV = vcvtq_n_s32_f32(inV,31);
+
+ vst1q_s32(pDst, outV);
+ pDst += 4;
+ pIn += 4;
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+}
+#else
+void arm_float_to_q31(
+ const float32_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const float32_t *pIn = pSrc; /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 2147483648 */
+
+ /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 2147483648 */
+
+ /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of float_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
new file mode 100644
index 0000000..27af520
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
@@ -0,0 +1,330 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q7.c
+ * Description: Converts the elements of the floating-point vector to Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup float_to_x
+ @{
+ */
+
+/**
+ * @brief Converts the elements of the floating-point vector to Q7 vector.
+ * @param[in] *pSrc points to the floating-point input vector
+ * @param[out] *pDst points to the Q7 output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ *\par Description:
+ * \par
+ * The equation used for the conversion process is:
+ * <pre>
+ * pDst[n] = (q7_t)(pSrc[n] * 128); 0 <= n < blockSize.
+ * </pre>
+ * \par Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ * \note
+ * In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ * defined in the preprocessor section of project options.
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_float_to_q7(
+ const float32_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ float32_t maxQ = powf(2.0, 7);
+ f32x4x4_t tmp;
+ q15x8_t evVec, oddVec;
+ q7x16_t vecDst;
+ float32_t const *pSrcVec;
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif
+
+ pSrcVec = (float32_t const *) pSrc;
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U) {
+ tmp = vld4q(pSrcVec);
+ pSrcVec += 16;
+ /*
+ * C = A * 128.0
+ * convert from float to q7 and then store the results in the destination buffer
+ */
+ tmp.val[0] = vmulq(tmp.val[0], maxQ);
+ tmp.val[1] = vmulq(tmp.val[1], maxQ);
+ tmp.val[2] = vmulq(tmp.val[2], maxQ);
+ tmp.val[3] = vmulq(tmp.val[3], maxQ);
+
+ /*
+ * convert and pack evens
+ */
+ evVec = vqmovnbq(evVec, vcvtaq_s32_f32(tmp.val[0]));
+ evVec = vqmovntq(evVec, vcvtaq_s32_f32(tmp.val[2]));
+ /*
+ * convert and pack odds
+ */
+ oddVec = vqmovnbq(oddVec, vcvtaq_s32_f32(tmp.val[1]));
+ oddVec = vqmovntq(oddVec, vcvtaq_s32_f32(tmp.val[3]));
+ /*
+ * merge
+ */
+ vecDst = vqmovnbq(vecDst, evVec);
+ vecDst = vqmovntq(vecDst, oddVec);
+
+ vst1q(pDst, vecDst);
+ pDst += 16;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = A * 128 */
+
+ /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pSrcVec++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ *pDst++ = (q7_t) __SSAT((q31_t) (*pSrcVec++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_float_to_q7(
+ const float32_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t inV;
+ #ifdef ARM_MATH_ROUNDING
+ float32_t in;
+ float32x4_t zeroV = vdupq_n_f32(0.0f);
+ float32x4_t pHalf = vdupq_n_f32(0.5f / 128.0f);
+ float32x4_t mHalf = vdupq_n_f32(-0.5f / 128.0f);
+ float32x4_t r;
+ uint32x4_t cmp;
+ #endif
+
+ int16x4_t cvt1,cvt2;
+ int8x8_t outV;
+
+ blkCnt = blockSize >> 3U;
+
+ /* Compute 8 outputs at a time.
+ ** a second loop below computes the remaining 1 to 7 samples. */
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 128 */
+ /* Convert from float to q7 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+ cmp = vcgtq_f32(inV,zeroV);
+ r = vbslq_f32(cmp,pHalf,mHalf);
+ inV = vaddq_f32(inV, r);
+ cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+ pIn += 4;
+
+ inV = vld1q_f32(pIn);
+ cmp = vcgtq_f32(inV,zeroV);
+ r = vbslq_f32(cmp,pHalf,mHalf);
+ inV = vaddq_f32(inV, r);
+ cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+ pIn += 4;
+
+ outV = vqmovn_s16(vcombine_s16(cvt1,cvt2));
+ vst1_s8(pDst, outV);
+ pDst += 8;
+
+#else
+
+ /* C = A * 128 */
+ /* Convert from float to q7 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+ cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+ pIn += 4;
+
+ inV = vld1q_f32(pIn);
+ cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+ pIn += 4;
+
+ outV = vqmovn_s16(vcombine_s16(cvt1,cvt2));
+
+ vst1_s8(pDst, outV);
+ pDst += 8;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 7;
+
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 128 */
+ /* Convert from float to q7 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ /* C = A * 128 */
+ /* Convert from float to q7 and then store the results in the destination buffer */
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+}
+#else
+void arm_float_to_q7(
+ const float32_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const float32_t *pIn = pSrc; /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 128 */
+
+ /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 128 */
+
+ /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ *pDst++ = (q7_t) __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of float_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
new file mode 100644
index 0000000..5a46caa
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
@@ -0,0 +1,119 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_heap_sort_f32.c
+ * Description: Floating point heap sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+
+
+static void arm_heapify(float32_t * pSrc, uint32_t n, uint32_t i, uint8_t dir)
+{
+ /* Put all the elements of pSrc in heap order */
+ uint32_t k = i; // Initialize largest/smallest as root
+ uint32_t l = 2*i + 1; // left = 2*i + 1
+ uint32_t r = 2*i + 2; // right = 2*i + 2
+ float32_t temp;
+
+ if (l < n && dir==(pSrc[l] > pSrc[k]) )
+ k = l;
+
+ if (r < n && dir==(pSrc[r] > pSrc[k]) )
+ k = r;
+
+ if (k != i)
+ {
+ temp = pSrc[i];
+ pSrc[i]=pSrc[k];
+ pSrc[k]=temp;
+
+ arm_heapify(pSrc, n, k, dir);
+ }
+}
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The heap sort algorithm is a comparison algorithm that
+ * divides the input array into a sorted and an unsorted region,
+ * and shrinks the unsorted region by extracting the largest
+ * element and moving it to the sorted region. A heap data
+ * structure is used to find the maximum.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+void arm_heap_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t * pA;
+ int32_t i;
+ float32_t temp;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ // Build the heap array so that the largest value is the root
+ for (i = blockSize/2 - 1; i >= 0; i--)
+ arm_heapify(pA, blockSize, i, S->dir);
+
+ for (i = blockSize - 1; i >= 0; i--)
+ {
+ // Swap
+ temp = pA[i];
+ pA[i] = pA[0];
+ pA[0] = temp;
+
+ // Restore heap order
+ arm_heapify(pA, i, 0, S->dir);
+ }
+}
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
new file mode 100644
index 0000000..4e85043
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
@@ -0,0 +1,93 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_insertion_sort_f32.c
+ * Description: Floating point insertion sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The insertion sort is a simple sorting algorithm that
+ * reads all the element of the input array and removes one element
+ * at a time, finds the location it belongs in the final sorted list,
+ * and inserts it there.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+void arm_insertion_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t *pSrc,
+ float32_t* pDst,
+ uint32_t blockSize)
+{
+ float32_t * pA;
+ uint8_t dir = S->dir;
+ uint32_t i, j;
+ float32_t temp;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ // Real all the element of the input array
+ for(i=0; i<blockSize; i++)
+ {
+ // Move the i-th element to the right position
+ for (j = i; j>0 && dir==(pA[j]<pA[j-1]); j--)
+ {
+ // Swap
+ temp = pA[j];
+ pA[j] = pA[j-1];
+ pA[j-1] = temp;
+ }
+ }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
new file mode 100644
index 0000000..5c21201
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
@@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_merge_sort_f32.c
+ * Description: Floating point merge sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+
+static void topDownMerge(float32_t * pA, uint32_t begin, uint32_t middle, uint32_t end, float32_t * pB, uint8_t dir)
+{
+ /* Left array is pA[begin:middle-1]
+ * Right Array is pA[middle:end-1]
+ * They are merged in pB
+ */
+
+ uint32_t i = begin;
+ uint32_t j = middle;
+ uint32_t k;
+
+ // Read all the elements in the sublist
+ for (k = begin; k < end; k++)
+ {
+ // Merge
+ if (i < middle && (j >= end || dir==(pA[i] <= pA[j])) )
+ {
+ pB[k] = pA[i];
+ i++;
+ }
+ else
+ {
+ pB[k] = pA[j];
+ j++;
+ }
+ }
+}
+
+static void arm_merge_sort_core_f32(float32_t * pB, uint32_t begin, uint32_t end, float32_t * pA, uint8_t dir)
+{
+ if((int32_t)end - (int32_t)begin >= 2 ) // If run size != 1 divide
+ {
+ int32_t middle = (end + begin) / 2; // Take the middle point
+
+ arm_merge_sort_core_f32(pA, begin, middle, pB, dir); // Sort the left part
+ arm_merge_sort_core_f32(pA, middle, end, pB, dir); // Sort the right part
+
+ topDownMerge(pB, begin, middle, end, pA, dir);
+ }
+}
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The merge sort algorithm is a comparison algorithm that
+ * divide the input array in sublists and merge them to produce
+ * longer sorted sublists until there is only one list remaining.
+ *
+ * @par A work array is always needed. It must be allocated by the user
+ * linked to the instance at initialization time.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed
+ */
+
+
+void arm_merge_sort_f32(
+ const arm_merge_sort_instance_f32 * S,
+ float32_t *pSrc,
+ float32_t *pDst,
+ uint32_t blockSize)
+{
+ float32_t * pA;
+
+ /* Out-of-place */
+ if(pSrc != pDst)
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t));
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ /* A working buffer is needed */
+ memcpy(S->buffer, pSrc, blockSize*sizeof(float32_t));
+
+ arm_merge_sort_core_f32(S->buffer, 0, blockSize, pA, S->dir);
+}
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
new file mode 100644
index 0000000..bd93a00
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
@@ -0,0 +1,53 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_merge_sort_init_f32.c
+ * Description: Floating point merge sort initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+
+ /**
+ * @param[in,out] S points to an instance of the sorting structure.
+ * @param[in] dir Sorting order.
+ * @param[in] buffer Working buffer.
+ */
+void arm_merge_sort_init_f32(arm_merge_sort_instance_f32 * S, arm_sort_dir dir, float32_t * buffer)
+{
+ S->dir = dir;
+ S->buffer = buffer;
+}
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c
new file mode 100644
index 0000000..22a7eaa
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c
@@ -0,0 +1,155 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q15_to_float.c
+ * Description: Converts the elements of the Q15 vector to floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q15_to_x Convert 16-bit fixed point value
+ */
+
+/**
+ @addtogroup q15_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q15 vector to f16 vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the f16 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (float16_t) pSrc[n] / 32768; 0 <= n < blockSize.
+ </pre>
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_q15_to_f16(
+ const q15_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ int32_t blkCnt; /* loop counters */
+ q15x8_t vecDst;
+ q15_t const *pSrcVec;
+
+ pSrcVec = (q15_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ /* C = (float16_t) A / 32768 */
+ /* convert from q15 to float and then store the results in the destination buffer */
+ vecDst = vld1q(pSrcVec); pSrcVec += 8;
+ vstrhq(pDst, vcvtq_n_f16_s16(vecDst, 15)); pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 7;
+ if (blkCnt > 0)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vecDst = vld1q(pSrcVec); pSrcVec += 8;
+ vstrhq_p(pDst, vcvtq_n_f16_s16(vecDst, 15), p0);
+ }
+}
+#else
+
+void arm_q15_to_f16(
+ const q15_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float16_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((_Float16) * pIn++ / 32768.0f16);
+ *pDst++ = ((_Float16) * pIn++ / 32768.0f16);
+ *pDst++ = ((_Float16) * pIn++ / 32768.0f16);
+ *pDst++ = ((_Float16) * pIn++ / 32768.0f16);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float16_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((_Float16) *pIn++ / 32768.0f16);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of q15_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
new file mode 100644
index 0000000..1a20def
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
@@ -0,0 +1,207 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q15_to_float.c
+ * Description: Converts the elements of the Q15 vector to floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q15_to_x Convert 16-bit fixed point value
+ */
+
+/**
+ @addtogroup q15_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q15 vector to floating-point vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (float32_t) pSrc[n] / 32768; 0 <= n < blockSize.
+ </pre>
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_float(
+ const q15_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+
+ q15x8_t vecDst;
+ q15_t const *pSrcVec;
+
+ pSrcVec = (q15_t const *) pSrc;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+ /* convert from q15 to float and then store the results in the destination buffer */
+ vecDst = vldrhq_s32(pSrcVec);
+ pSrcVec += 4;
+ vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15));
+ pDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pSrcVec++ / 32768.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_q15_to_float(
+ const q15_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const q15_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ int16x8_t inV;
+ int32x4_t inV0, inV1;
+ float32x4_t outV;
+
+ blkCnt = blockSize >> 3U;
+
+ /* Compute 8 outputs at a time.
+ ** a second loop below computes the remaining 1 to 7 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+ /* convert from q15 to float and then store the results in the destination buffer */
+ inV = vld1q_s16(pIn);
+ pIn += 8;
+
+ inV0 = vmovl_s16(vget_low_s16(inV));
+ inV1 = vmovl_s16(vget_high_s16(inV));
+
+ outV = vcvtq_n_f32_s32(inV0,15);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ outV = vcvtq_n_f32_s32(inV1,15);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 7;
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+ /* convert from q15 to float and then store the results in the destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q15_to_float(
+ const q15_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pIn++ / 32768.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of q15_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
new file mode 100644
index 0000000..fc3c868
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
@@ -0,0 +1,182 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q15_to_q31.c
+ * Description: Converts the elements of the Q15 vector to Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q15_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q15 vector to Q31 vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q31_t) pSrc[n] << 16; 0 <= n < blockSize.
+ </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_q31(
+ const q15_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+
+ uint32_t blkCnt;
+
+ q31x4_t vecDst;
+
+ blkCnt = blockSize>> 2;
+ while (blkCnt > 0U)
+ {
+
+ /* C = (q31_t)A << 16 */
+ /* convert from q15 to q31 and then store the results in the destination buffer */
+ /* load q15 + 32-bit widening */
+ vecDst = vldrhq_s32((q15_t const *) pSrc);
+ vecDst = vshlq_n(vecDst, 16);
+ vstrwq_s32(pDst, vecDst);
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 4;
+ pSrc += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 16 */
+
+ /* Convert from q15 to q31 and store result in destination buffer */
+ *pDst++ = (q31_t) *pSrc++ << 16;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q15_to_q31(
+ const q15_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in1, in2;
+ q31_t out1, out2, out3, out4;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t)A << 16 */
+
+ /* Convert from q15 to q31 and store result in destination buffer */
+ in1 = read_q15x2_ia (&pIn);
+ in2 = read_q15x2_ia (&pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ /* extract lower 16 bits to 32 bit result */
+ out1 = in1 << 16U;
+ /* extract upper 16 bits to 32 bit result */
+ out2 = in1 & 0xFFFF0000;
+ /* extract lower 16 bits to 32 bit result */
+ out3 = in2 << 16U;
+ /* extract upper 16 bits to 32 bit result */
+ out4 = in2 & 0xFFFF0000;
+
+#else
+
+ /* extract upper 16 bits to 32 bit result */
+ out1 = in1 & 0xFFFF0000;
+ /* extract lower 16 bits to 32 bit result */
+ out2 = in1 << 16U;
+ /* extract upper 16 bits to 32 bit result */
+ out3 = in2 & 0xFFFF0000;
+ /* extract lower 16 bits to 32 bit result */
+ out4 = in2 << 16U;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ *pDst++ = out1;
+ *pDst++ = out2;
+ *pDst++ = out3;
+ *pDst++ = out4;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 16 */
+
+ /* Convert from q15 to q31 and store result in destination buffer */
+ *pDst++ = (q31_t) *pIn++ << 16;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q15_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
new file mode 100644
index 0000000..eac8105
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
@@ -0,0 +1,190 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q15_to_q7.c
+ * Description: Converts the elements of the Q15 vector to Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q15_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q15 vector to Q7 vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the Q7 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q7_t) pSrc[n] >> 8; 0 <= n < blockSize.
+ </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_q7(
+ const q15_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+
+ uint32_t blkCnt; /* loop counters */
+ q15x8x2_t tmp;
+ q15_t const *pSrcVec;
+ q7x16_t vecDst;
+
+
+ pSrcVec = (q15_t const *) pSrc;
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) A >> 8 */
+ /* convert from q15 to q7 and then store the results in the destination buffer */
+ tmp = vld2q(pSrcVec);
+ pSrcVec += 16;
+ vecDst = vqshrnbq_n_s16(vecDst, tmp.val[0], 8);
+ vecDst = vqshrntq_n_s16(vecDst, tmp.val[1], 8);
+ vst1q(pDst, vecDst);
+ pDst += 16;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) A >> 8 */
+
+ /* Convert from q15 to q7 and store result in destination buffer */
+ *pDst++ = (q7_t) (*pSrcVec++ >> 8);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q15_to_q7(
+ const q15_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in1, in2;
+ q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) A >> 8 */
+
+ /* Convert from q15 to q7 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+ in1 = read_q15x2_ia (&pIn);
+ in2 = read_q15x2_ia (&pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __PKHTB(in2, in1, 16);
+ out2 = __PKHBT(in2, in1, 16);
+
+#else
+
+ out1 = __PKHTB(in1, in2, 16);
+ out2 = __PKHBT(in1, in2, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* rotate packed value by 24 */
+ out2 = ((uint32_t) out2 << 8) | ((uint32_t) out2 >> 24);
+
+ /* anding with 0xff00ff00 to get two 8 bit values */
+ out1 = out1 & 0xFF00FF00;
+ /* anding with 0x00ff00ff to get two 8 bit values */
+ out2 = out2 & 0x00FF00FF;
+
+ /* oring two values(contains two 8 bit values) to get four packed 8 bit values */
+ out1 = out1 | out2;
+
+ /* store 4 samples at a time to destiantion buffer */
+ write_q7x4_ia (&pDst, out1);
+
+#else
+
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) A >> 8 */
+
+ /* Convert from q15 to q7 and store result in destination buffer */
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q15_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
new file mode 100644
index 0000000..ce7ce42
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q31_to_float.c
+ * Description: Converts the elements of the Q31 vector to floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q31_to_x Convert 32-bit fixed point value
+ */
+
+/**
+ @addtogroup q31_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q31 vector to floating-point vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (float32_t) pSrc[n] / 2147483648; 0 <= n < blockSize.
+ </pre>
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_float(
+ const q31_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ q31x4_t vecDst;
+ q31_t const *pSrcVec;
+
+ pSrcVec = (q31_t const *) pSrc;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+ /* convert from q31 to float and then store the results in the destination buffer */
+ vecDst = vld1q(pSrcVec);
+ pSrcVec += 4;
+ vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 31));
+ pDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+
+ /* Convert from q31 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pSrcVec++ / 2147483648.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_q31_to_float(
+ const q31_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const q31_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ int32x4_t inV;
+ float32x4_t outV;
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+ /* Convert from q31 to float and then store the results in the destination buffer */
+ inV = vld1q_s32(pIn);
+ pIn += 4;
+
+ outV = vcvtq_n_f32_s32(inV,31);
+
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+ /* Convert from q31 to float and then store the results in the destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 2147483648.0f);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q31_to_float(
+ const q31_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const q31_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+
+ /* Convert from q31 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+
+ /* Convert from q31 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of q31_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
new file mode 100644
index 0000000..a25a2bb
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q31_to_q15.c
+ * Description: Converts the elements of the Q31 vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q31_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q31 vector to Q15 vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q15_t) pSrc[n] >> 16; 0 <= n < blockSize.
+ </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_q15(
+ const q31_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ q31x4x2_t tmp;
+ q15x8_t vecDst;
+ q31_t const *pSrcVec;
+
+
+ pSrcVec = (q31_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A >> 16 */
+ /* convert from q31 to q15 and then store the results in the destination buffer */
+ tmp = vld2q(pSrcVec);
+ pSrcVec += 8;
+ vecDst = vshrnbq_n_s32(vecDst, tmp.val[0], 16);
+ vecDst = vshrntq_n_s32(vecDst, tmp.val[1], 16);
+ vst1q(pDst, vecDst);
+ pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) (A >> 16) */
+
+ /* Convert from q31 to q15 and store result in destination buffer */
+ *pDst++ = (q15_t) (*pSrcVec++ >> 16);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_q31_to_q15(
+ const q31_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q31_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in1, in2, in3, in4;
+ q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) (A >> 16) */
+
+ /* Convert from q31 to q15 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+ in1 = *pIn++;
+ in2 = *pIn++;
+ in3 = *pIn++;
+ in4 = *pIn++;
+
+ /* pack two higher 16-bit values from two 32-bit values */
+#ifndef ARM_MATH_BIG_ENDIAN
+ out1 = __PKHTB(in2, in1, 16);
+ out2 = __PKHTB(in4, in3, 16);
+#else
+ out1 = __PKHTB(in1, in2, 16);
+ out2 = __PKHTB(in3, in4, 16);
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ write_q15x2_ia (&pDst, out1);
+ write_q15x2_ia (&pDst, out2);
+
+#else
+
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) (A >> 16) */
+
+ /* Convert from q31 to q15 and store result in destination buffer */
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q31_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
new file mode 100644
index 0000000..16fbe18
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q31_to_q7.c
+ * Description: Converts the elements of the Q31 vector to Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q31_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q31 vector to Q7 vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the Q7 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q7_t) pSrc[n] >> 24; 0 <= n < blockSize.
+ </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_q7(
+ const q31_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ q31x4x4_t tmp;
+ q15x8_t evVec, oddVec;
+ q7x16_t vecDst;
+ q31_t const *pSrcVec;
+
+ pSrcVec = (q31_t const *) pSrc;
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+ tmp = vld4q(pSrcVec);
+ pSrcVec += 16;
+ /* C = (q7_t) A >> 24 */
+ /* convert from q31 to q7 and then store the results in the destination buffer */
+ /*
+ * narrow and pack evens
+ */
+ evVec = vshrnbq_n_s32(evVec, tmp.val[0], 16);
+ evVec = vshrntq_n_s32(evVec, tmp.val[2], 16);
+ /*
+ * narrow and pack odds
+ */
+ oddVec = vshrnbq_n_s32(oddVec, tmp.val[1], 16);
+ oddVec = vshrntq_n_s32(oddVec, tmp.val[3], 16);
+ /*
+ * narrow & merge
+ */
+ vecDst = vshrnbq_n_s16(vecDst, evVec, 8);
+ vecDst = vshrntq_n_s16(vecDst, oddVec, 8);
+
+ vst1q(pDst, vecDst);
+ pDst += 16;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) (A >> 24) */
+
+ /* Convert from q31 to q7 and store result in destination buffer */
+ *pDst++ = (q7_t) (*pSrcVec++ >> 24);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q31_to_q7(
+ const q31_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q31_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ q7_t out1, out2, out3, out4;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) (A >> 24) */
+
+ /* Convert from q31 to q7 and store result in destination buffer */
+
+ out1 = (q7_t) (*pIn++ >> 24);
+ out2 = (q7_t) (*pIn++ >> 24);
+ out3 = (q7_t) (*pIn++ >> 24);
+ out4 = (q7_t) (*pIn++ >> 24);
+ write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) (A >> 24) */
+
+ /* Convert from q31 to q7 and store result in destination buffer */
+ *pDst++ = (q7_t) (*pIn++ >> 24);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q31_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
new file mode 100644
index 0000000..9cbf8ad
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
@@ -0,0 +1,218 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q7_to_float.c
+ * Description: Converts the elements of the Q7 vector to floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q7_to_x Convert 8-bit fixed point value
+ */
+
+/**
+ @addtogroup q7_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q7 vector to floating-point vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (float32_t) pSrc[n] / 128; 0 <= n < blockSize.
+ </pre>
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_float(
+ const q7_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ q7x16_t vecDst;
+ q7_t const *pSrcVec;
+
+ pSrcVec = (q7_t const *) pSrc;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+ /* convert from q7 to float and then store the results in the destination buffer */
+ vecDst = vldrbq_s32(pSrcVec);
+ pSrcVec += 4;
+ vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7));
+ pDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+
+ /* Convert from q7 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) * pSrcVec++ / 128.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_q7_to_float(
+ const q7_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const q7_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ int8x16_t inV;
+ int16x8_t inVLO, inVHI;
+ int32x4_t inVLL, inVLH, inVHL, inVHH;
+ float32x4_t outV;
+
+ blkCnt = blockSize >> 4U;
+
+ /* Compute 16 outputs at a time.
+ ** a second loop below computes the remaining 1 to 15 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+ /* Convert from q7 to float and then store the results in the destination buffer */
+ inV = vld1q_s8(pIn);
+ pIn += 16;
+
+ inVLO = vmovl_s8(vget_low_s8(inV));
+ inVHI = vmovl_s8(vget_high_s8(inV));
+
+ inVLL = vmovl_s16(vget_low_s16(inVLO));
+ inVLH = vmovl_s16(vget_high_s16(inVLO));
+ inVHL = vmovl_s16(vget_low_s16(inVHI));
+ inVHH = vmovl_s16(vget_high_s16(inVHI));
+
+ outV = vcvtq_n_f32_s32(inVLL,7);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ outV = vcvtq_n_f32_s32(inVLH,7);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ outV = vcvtq_n_f32_s32(inVHL,7);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ outV = vcvtq_n_f32_s32(inVHH,7);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 16, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 0xF;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+ /* Convert from q7 to float and then store the results in the destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q7_to_float(
+ const q7_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q7_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+
+ /* Convert from q7 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+
+ /* Convert from q7 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of q7_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
new file mode 100644
index 0000000..be52531
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
@@ -0,0 +1,188 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q7_to_q15.c
+ * Description: Converts the elements of the Q7 vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q7_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q7 vector to Q15 vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q15_t) pSrc[n] << 8; 0 <= n < blockSize.
+ </pre>
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_q15(
+ const q7_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+
+ uint32_t blkCnt; /* loop counters */
+ q15x8_t vecDst;
+ q7_t const *pSrcVec;
+
+
+ pSrcVec = (q7_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A << 8 */
+ /* convert from q7 to q15 and then store the results in the destination buffer */
+ /* load q7 + 32-bit widening */
+ vecDst = vldrbq_s16(pSrcVec);
+ pSrcVec += 8;
+ vecDst = vecDst << 8;
+ vstrhq(pDst, vecDst);
+ pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A << 8 */
+
+ /* Convert from q7 to q15 and store result in destination buffer */
+ *pDst++ = (q15_t) * pSrcVec++ << 8;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#else
+void arm_q7_to_q15(
+ const q7_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q7_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in;
+ q31_t in1, in2;
+ q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A << 8 */
+
+ /* Convert from q7 to q15 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+ in = read_q7x4_ia (&pIn);
+
+ /* rotatate in by 8 and extend two q7_t values to q15_t values */
+ in1 = __SXTB16(__ROR(in, 8));
+
+ /* extend remainig two q7_t values to q15_t values */
+ in2 = __SXTB16(in);
+
+ in1 = in1 << 8U;
+ in2 = in2 << 8U;
+
+ in1 = in1 & 0xFF00FF00;
+ in2 = in2 & 0xFF00FF00;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ out2 = __PKHTB(in1, in2, 16);
+ out1 = __PKHBT(in2, in1, 16);
+#else
+ out1 = __PKHTB(in1, in2, 16);
+ out2 = __PKHBT(in2, in1, 16);
+#endif
+
+ write_q15x2_ia (&pDst, out1);
+ write_q15x2_ia (&pDst, out2);
+
+#else
+
+ *pDst++ = (q15_t) *pIn++ << 8;
+ *pDst++ = (q15_t) *pIn++ << 8;
+ *pDst++ = (q15_t) *pIn++ << 8;
+ *pDst++ = (q15_t) *pIn++ << 8;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A << 8 */
+
+ /* Convert from q7 to q15 and store result in destination buffer */
+ *pDst++ = (q15_t) * pIn++ << 8;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q7_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
new file mode 100644
index 0000000..01d5f2b
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
@@ -0,0 +1,164 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q7_to_q31.c
+ * Description: Converts the elements of the Q7 vector to Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q7_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q7 vector to Q31 vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+ <pre>
+ pDst[n] = (q31_t) pSrc[n] << 24; 0 <= n < blockSize.
+ </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_q31(
+ const q7_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ q31x4_t vecDst;
+
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+
+ /* C = (q31_t)A << 16 */
+ /* convert from q15 to q31 and then store the results in the destination buffer */
+ /* load q15 + 32-bit widening */
+ vecDst = vldrbq_s32((q7_t const *) pSrc);
+ vecDst = vshlq_n(vecDst, 24);
+ vstrwq_s32(pDst, vecDst);
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 4;
+ pSrc += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 24 */
+
+ /* Convert from q7 to q31 and store result in destination buffer */
+ *pDst++ = (q31_t) *pSrc++ << 24;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_q7_to_q31(
+ const q7_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q7_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ q31_t in;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 24 */
+
+ /* Convert from q7 to q31 and store result in destination buffer */
+ in = read_q7x4_ia (&pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *pDst++ = (__ROR(in, 8)) & 0xFF000000;
+ *pDst++ = (__ROR(in, 16)) & 0xFF000000;
+ *pDst++ = (__ROR(in, 24)) & 0xFF000000;
+ *pDst++ = (in & 0xFF000000);
+
+#else
+
+ *pDst++ = (in & 0xFF000000);
+ *pDst++ = (__ROR(in, 24)) & 0xFF000000;
+ *pDst++ = (__ROR(in, 16)) & 0xFF000000;
+ *pDst++ = (__ROR(in, 8)) & 0xFF000000;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 24 */
+
+ /* Convert from q7 to q31 and store result in destination buffer */
+ *pDst++ = (q31_t) * pIn++ << 24;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q7_to_x group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
new file mode 100644
index 0000000..c6c44d9
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quick_sort_f32.c
+ * Description: Floating point quick sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+static uint32_t arm_quick_sort_partition_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir)
+{
+ /* This function will be called */
+ int32_t i, j, pivot_index;
+ float32_t pivot;
+ float32_t temp;
+
+ /* The first element is the pivot */
+ pivot_index = first;
+ pivot = pSrc[pivot_index];
+
+ /* Initialize indices for do-while loops */
+ i = first - 1;
+ j = last + 1;
+
+ while(i < j)
+ {
+ /* The loop will stop as soon as the indices i and j cross each other.
+ *
+ * This event will happen surely since the values of the indices are incremented and
+ * decrement in the do-while loops that are executed at least once.
+ * It is impossible to loop forever inside the do-while loops since the pivot is
+ * always an element of the array and the conditions cannot be always true (at least
+ * the i-th or the j-th element will be equal to the pivot-th element).
+ * For example, in the extreme case of an ordered array the do-while loop related to i will stop
+ * at the first iteration (because pSrc[i]=pSrc[pivot] already), and the loop related to j
+ * will stop after (last-first) iterations (when j=pivot=i=first). j is returned and
+ * j+1 is going to be used as pivot by other calls of the function, until j=pivot=last. */
+
+ /* Move indices to the right and to the left */
+ if(dir)
+ {
+ /* Compare left elements with pivot */
+ do
+ {
+ i++;
+ } while (pSrc[i] < pivot && i<last);
+
+ /* Compare right elements with pivot */
+ do
+ {
+ j--;
+ } while (pSrc[j] > pivot);
+ }
+ else
+ {
+ /* Compare left elements with pivot */
+ do
+ {
+ i++;
+ } while (pSrc[i] > pivot && i<last);
+
+ /* Compare right elements with pivot */
+ do
+ {
+ j--;
+ } while (pSrc[j] < pivot);
+ }
+
+ /* If the indices didn't cross each other */
+ if (i < j)
+ {
+ /* i and j are in the wrong position -> Swap */
+ temp=pSrc[i];
+ pSrc[i]=pSrc[j];
+ pSrc[j]=temp;
+ }
+ }
+
+ return j;
+}
+
+static void arm_quick_sort_core_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir)
+{
+ /* If the array [first ... last] has more than one element */
+ if(first<last)
+ {
+ int32_t pivot;
+
+ /* Compute pivot */
+ pivot = arm_quick_sort_partition_f32(pSrc, first, last, dir);
+
+ /* Iterate algorithm with two sub-arrays [first ... pivot] and [pivot+1 ... last] */
+ arm_quick_sort_core_f32(pSrc, first, pivot, dir);
+ arm_quick_sort_core_f32(pSrc, pivot+1, last, dir);
+ }
+}
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in,out] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The quick sort algorithm is a comparison algorithm that
+ * divides the input array into two smaller sub-arrays and
+ * recursively sort them. An element of the array (the pivot)
+ * is chosen, all the elements with values smaller than the
+ * pivot are moved before the pivot, while all elements with
+ * values greater than the pivot are moved after it (partition).
+ *
+ * @par
+ * In this implementation the Hoare partition scheme has been
+ * used [Hoare, C. A. R. (1 January 1962). "Quicksort". The Computer
+ * Journal. 5 (1): 10...16.] The first element has always been chosen
+ * as the pivot. The partition algorithm guarantees that the returned
+ * pivot is never placed outside the vector, since it is returned only
+ * when the pointers crossed each other. In this way it isn't
+ * possible to obtain empty partitions and infinite recursion is avoided.
+ *
+ * @par
+ * It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+void arm_quick_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t * pA;
+
+ /* Out-of-place */
+ if(pSrc != pDst)
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ arm_quick_sort_core_f32(pA, 0, blockSize-1, S->dir);
+ /* The previous function could be called recursively a maximum
+ * of (blockSize-1) times, generating a stack consumption of 4*(blockSize-1) bytes. */
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
new file mode 100644
index 0000000..ad534bc
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_selection_sort_f32.c
+ * Description: Floating point selection sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The Selection sort algorithm is a comparison algorithm that
+ * divides the input array into a sorted and an unsorted sublist
+ * (initially the sorted sublist is empty and the unsorted sublist
+ * is the input array), looks for the smallest (or biggest)
+ * element in the unsorted sublist, swapping it with the leftmost
+ * one, and moving the sublists boundary one element to the right.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+void arm_selection_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t i, j, k;
+ uint8_t dir = S->dir;
+ float32_t temp;
+
+ float32_t * pA;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ /* Move the boundary one element to the right */
+ for (i=0; i<blockSize-1; i++)
+ {
+ /* Initialize the minimum/maximum as the first element */
+ k = i;
+
+ /* Look in the unsorted list to find the minimum/maximum value */
+ for (j=i+1; j<blockSize; j++)
+ {
+ if (dir==(pA[j] < pA[k]) )
+ {
+ /* Update value */
+ k = j;
+ }
+ }
+
+ if (k != i)
+ {
+ /* Swap the minimum/maximum with the leftmost element */
+ temp=pA[i];
+ pA[i]=pA[k];
+ pA[k]=temp;
+ }
+ }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
new file mode 100644
index 0000000..43786dc
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
@@ -0,0 +1,86 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_sort_f32.c
+ * Description: Floating point sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+
+/**
+ * @brief Generic sorting function
+ *
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ */
+
+void arm_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ switch(S->alg)
+ {
+ case ARM_SORT_BITONIC:
+ arm_bitonic_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_BUBBLE:
+ arm_bubble_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_HEAP:
+ arm_heap_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_INSERTION:
+ arm_insertion_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_QUICK:
+ arm_quick_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_SELECTION:
+ arm_selection_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+ }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
new file mode 100644
index 0000000..72ad9c5
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
@@ -0,0 +1,54 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_sort_init_f32.c
+ * Description: Floating point sort initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+
+ /**
+ * @param[in,out] S points to an instance of the sorting structure.
+ * @param[in] alg Selected algorithm.
+ * @param[in] dir Sorting order.
+ */
+void arm_sort_init_f32(arm_sort_instance_f32 * S, arm_sort_alg alg, arm_sort_dir dir)
+{
+ S->alg = alg;
+ S->dir = dir;
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c
new file mode 100644
index 0000000..f9aae6b
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c
@@ -0,0 +1,146 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_weighted_sum_f16.c
+ * Description: Weighted Sum
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup weightedsum Weighted Sum
+
+ Weighted sum of values
+ */
+
+
+/**
+ * @addtogroup weightedsum
+ * @{
+ */
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in] *in Array of input values.
+ * @param[in] *weigths Weights
+ * @param[in] blockSize Number of samples in the input array.
+ * @return Weighted sum
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+float16_t arm_weighted_sum_f16(const float16_t *in,const float16_t *weigths, uint32_t blockSize)
+{
+ _Float16 accum1, accum2;
+ float16x8_t accum1V, accum2V;
+ float16x8_t inV, wV;
+ const float16_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+
+ accum1V = vdupq_n_f16(0.0f16);
+ accum2V = vdupq_n_f16(0.0f16);
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ inV = vld1q(pIn);
+ wV = vld1q(pW);
+
+ pIn += 4;
+ pW += 4;
+
+ accum1V = vfmaq(accum1V, inV, wV);
+ accum2V = vaddq(accum2V, wV);
+ blkCnt--;
+ }
+
+ accum1 = vecAddAcrossF16Mve(accum1V);
+ accum2 = vecAddAcrossF16Mve(accum2V);
+
+ blkCnt = blockSize & 7;
+ while(blkCnt > 0)
+ {
+ accum1 += (_Float16)*pIn++ * (_Float16)*pW;
+ accum2 += (_Float16)*pW++;
+ blkCnt--;
+ }
+
+
+ return (accum1 / accum2);
+}
+
+#else
+
+float16_t arm_weighted_sum_f16(const float16_t *in, const float16_t *weigths, uint32_t blockSize)
+{
+
+ _Float16 accum1, accum2;
+ const float16_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+ accum1=0.0f16;
+ accum2=0.0f16;
+
+ blkCnt = blockSize;
+ while(blkCnt > 0)
+ {
+ accum1 += (_Float16)*pIn++ * (_Float16)*pW;
+ accum2 += (_Float16)*pW++;
+ blkCnt--;
+ }
+
+ return(accum1 / accum2);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of weightedsum group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
new file mode 100644
index 0000000..dd37861
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
@@ -0,0 +1,187 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_weighted_sum_f32.c
+ * Description: Weighted Sum
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "dsp/support_functions.h"
+
+/**
+ * @addtogroup weightedsum
+ * @{
+ */
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in] *in Array of input values.
+ * @param[in] *weigths Weights
+ * @param[in] blockSize Number of samples in the input array.
+ * @return Weighted sum
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize)
+{
+ float32_t accum1, accum2;
+ f32x4_t accum1V, accum2V;
+ f32x4_t inV, wV;
+ const float32_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+
+ accum1V = vdupq_n_f32(0.0);
+ accum2V = vdupq_n_f32(0.0);
+
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0)
+ {
+ inV = vld1q(pIn);
+ wV = vld1q(pW);
+
+ pIn += 4;
+ pW += 4;
+
+ accum1V = vfmaq(accum1V, inV, wV);
+ accum2V = vaddq(accum2V, wV);
+ blkCnt--;
+ }
+
+ accum1 = vecAddAcrossF32Mve(accum1V);
+ accum2 = vecAddAcrossF32Mve(accum2V);
+
+ blkCnt = blockSize & 3;
+ while(blkCnt > 0)
+ {
+ accum1 += *pIn++ * *pW;
+ accum2 += *pW++;
+ blkCnt--;
+ }
+
+
+ return (accum1 / accum2);
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize)
+{
+
+ float32_t accum1, accum2;
+ float32x4_t accum1V, accum2V;
+ float32x2_t tempV;
+
+ float32x4_t inV,wV;
+
+ const float32_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+ accum1=0.0f;
+ accum2=0.0f;
+
+ accum1V = vdupq_n_f32(0.0f);
+ accum2V = vdupq_n_f32(0.0f);
+
+ blkCnt = blockSize >> 2;
+ while(blkCnt > 0)
+ {
+ inV = vld1q_f32(pIn);
+ wV = vld1q_f32(pW);
+
+ pIn += 4;
+ pW += 4;
+
+ accum1V = vmlaq_f32(accum1V,inV,wV);
+ accum2V = vaddq_f32(accum2V,wV);
+ blkCnt--;
+ }
+
+ tempV = vpadd_f32(vget_low_f32(accum1V),vget_high_f32(accum1V));
+ accum1 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
+
+ tempV = vpadd_f32(vget_low_f32(accum2V),vget_high_f32(accum2V));
+ accum2 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
+
+ blkCnt = blockSize & 3;
+ while(blkCnt > 0)
+ {
+ accum1 += *pIn++ * *pW;
+ accum2 += *pW++;
+ blkCnt--;
+ }
+
+
+ return(accum1 / accum2);
+}
+#else
+float32_t arm_weighted_sum_f32(const float32_t *in, const float32_t *weigths, uint32_t blockSize)
+{
+
+ float32_t accum1, accum2;
+ const float32_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+ accum1=0.0f;
+ accum2=0.0f;
+
+ blkCnt = blockSize;
+ while(blkCnt > 0)
+ {
+ accum1 += *pIn++ * *pW;
+ accum2 += *pW++;
+ blkCnt--;
+ }
+
+ return(accum1 / accum2);
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of weightedsum group
+ */