summaryrefslogtreecommitdiffstats
path: root/Drivers/CMSIS/DSP/Source/SVMFunctions
diff options
context:
space:
mode:
Diffstat (limited to 'Drivers/CMSIS/DSP/Source/SVMFunctions')
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt41
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c36
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c36
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c98
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c92
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c314
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c461
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c103
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c97
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c369
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c490
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c97
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c91
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c352
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c523
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c98
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c92
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c333
-rw-r--r--Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c487
19 files changed, 4210 insertions, 0 deletions
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt b/Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt
new file mode 100644
index 0000000..eb1e0ca
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required (VERSION 3.14)
+
+project(CMSISDSPSVM)
+
+include(configLib)
+include(configDsp)
+
+
+add_library(CMSISDSPSVM STATIC)
+
+target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_init_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_init_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_predict_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_predict_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_init_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_init_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_predict_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_predict_f32.c)
+
+
+
+configLib(CMSISDSPSVM ${ROOT})
+configDsp(CMSISDSPSVM ${ROOT})
+
+### Includes
+target_include_directories(CMSISDSPSVM PUBLIC "${DSP}/Include")
+
+if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
+target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_init_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_init_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_predict_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_predict_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_init_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_init_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_predict_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_predict_f16.c)
+endif()
+
+
+
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c
new file mode 100644
index 0000000..85f19b9
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: BayesFunctions.c
+ * Description: Combination of all SVM function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_svm_linear_init_f32.c"
+#include "arm_svm_linear_predict_f32.c"
+#include "arm_svm_polynomial_init_f32.c"
+#include "arm_svm_polynomial_predict_f32.c"
+#include "arm_svm_rbf_init_f32.c"
+#include "arm_svm_rbf_predict_f32.c"
+#include "arm_svm_sigmoid_init_f32.c"
+#include "arm_svm_sigmoid_predict_f32.c"
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c
new file mode 100644
index 0000000..74d2665
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: BayesFunctions.c
+ * Description: Combination of all SVM function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_svm_linear_init_f16.c"
+#include "arm_svm_linear_predict_f16.c"
+#include "arm_svm_polynomial_init_f16.c"
+#include "arm_svm_polynomial_predict_f16.c"
+#include "arm_svm_rbf_init_f16.c"
+#include "arm_svm_rbf_predict_f16.c"
+#include "arm_svm_sigmoid_init_f16.c"
+#include "arm_svm_sigmoid_predict_f16.c"
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c
new file mode 100644
index 0000000..a3ebc7f
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c
@@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_linear_init_f16.c
+ * Description: SVM Linear Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @defgroup groupSVM SVM Functions
+ *
+ */
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup linearsvm Linear SVM
+
+ Linear SVM classifier
+ */
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S Parameters for the SVM function
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @return none.
+ *
+ */
+
+
+void arm_svm_linear_init_f16(arm_svm_linear_instance_f16 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float16_t intercept,
+ const float16_t *dualCoefficients,
+ const float16_t *supportVectors,
+ const int32_t *classes)
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+}
+
+
+
+/**
+ * @} end of linearsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
new file mode 100644
index 0000000..75395aa
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_linear_init_f32.c
+ * Description: SVM Linear Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @defgroup groupSVM SVM Functions
+ *
+ */
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup linearsvm Linear SVM
+
+ Linear SVM classifier
+ */
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S Parameters for the SVM function
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @return none.
+ *
+ */
+
+
+void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float32_t intercept,
+ const float32_t *dualCoefficients,
+ const float32_t *supportVectors,
+ const int32_t *classes)
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+}
+
+
+
+/**
+ * @} end of linearsvm group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
new file mode 100644
index 0000000..fe907e5
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
@@ -0,0 +1,314 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_linear_predict_f16.c
+ * Description: SVM Linear Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear prediction
+ * @param[in] S Pointer to an instance of the linear SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_svm_linear_predict_f16(
+ const arm_svm_linear_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float16_t *pSupport = S->supportVectors;
+ const float16_t *pSrcA = pSupport;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float16_t *pDualCoef = S->dualCoefficients;
+ _Float16 sum = S->intercept;
+ row = numRows;
+
+ /*
+ * compute 4 rows in parrallel
+ */
+ while (row >= 4)
+ {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ acc2 = vdupq_n_f16(0.0f);
+ acc3 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ acc0 = vmulq_n_f16(acc0,*pDualCoef++);
+ acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
+ acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
+ acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
+
+ sum += (_Float16)vecAddAcrossF16Mve(acc0);
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ acc0 = vmulq_n_f16(acc0,*pDualCoef++);
+ acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
+
+ sum += (_Float16)vecAddAcrossF16Mve(acc0);
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
+
+ }
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_linear_predict_f16(
+ const arm_svm_linear_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ _Float16 sum=S->intercept;
+ _Float16 dot=0;
+ uint32_t i,j;
+ const float16_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++;
+ }
+ sum += (_Float16)S->dualCoefficients[i] * (_Float16)dot;
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of linearsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
new file mode 100644
index 0000000..caf09df
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
@@ -0,0 +1,461 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_linear_predict_f32.c
+ * Description: SVM Linear Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear prediction
+ * @param[in] S Pointer to an instance of the linear SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_svm_linear_predict_f32(
+ const arm_svm_linear_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float32_t *pSupport = S->supportVectors;
+ const float32_t *pSrcA = pSupport;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float32_t *pDualCoef = S->dualCoefficients;
+ float32_t sum = S->intercept;
+ row = numRows;
+
+ /*
+ * compute 4 rows in parrallel
+ */
+ while (row >= 4)
+ {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+
+ acc0 = vmulq_n_f32(acc0,*pDualCoef++);
+ acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
+ acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++);
+ acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++);
+
+ sum += vecAddAcrossF32Mve(acc0);
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ acc0 = vmulq_n_f32(acc0,*pDualCoef++);
+ acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
+
+ sum += vecAddAcrossF32Mve(acc0);
+
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
+
+ }
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+void arm_svm_linear_predict_f32(
+ const arm_svm_linear_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum = S->intercept;
+
+ float32_t dot;
+ float32x4_t dotV;
+
+ float32x4_t accuma,accumb,accumc,accumd,accum;
+ float32x2_t accum2;
+ float32x4_t vec1;
+
+ float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+ uint32_t blkCnt;
+ uint32_t vectorBlkCnt;
+
+ const float32_t *pIn = in;
+
+ const float32_t *pSupport = S->supportVectors;
+
+ const float32_t *pSupporta = S->supportVectors;
+ const float32_t *pSupportb;
+ const float32_t *pSupportc;
+ const float32_t *pSupportd;
+
+ pSupportb = pSupporta + S->vectorDimension;
+ pSupportc = pSupportb + S->vectorDimension;
+ pSupportd = pSupportc + S->vectorDimension;
+
+ const float32_t *pDualCoefs = S->dualCoefficients;
+
+ vectorBlkCnt = S->nbOfSupportVectors >> 2;
+
+ while (vectorBlkCnt > 0U)
+ {
+ accuma = vdupq_n_f32(0);
+ accumb = vdupq_n_f32(0);
+ accumc = vdupq_n_f32(0);
+ accumd = vdupq_n_f32(0);
+
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2a = vld1q_f32(pSupporta);
+ vec2b = vld1q_f32(pSupportb);
+ vec2c = vld1q_f32(pSupportc);
+ vec2d = vld1q_f32(pSupportd);
+
+ pIn += 4;
+ pSupporta += 4;
+ pSupportb += 4;
+ pSupportc += 4;
+ pSupportd += 4;
+
+ accuma = vmlaq_f32(accuma, vec1,vec2a);
+ accumb = vmlaq_f32(accumb, vec1,vec2b);
+ accumc = vmlaq_f32(accumc, vec1,vec2c);
+ accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+ accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+ accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+ accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+ pIn++;
+
+ blkCnt -- ;
+ }
+
+ vec1 = vld1q_f32(pDualCoefs);
+ pDualCoefs += 4;
+
+ accum = vmulq_f32(vec1,dotV);
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+ pSupporta += 3*S->vectorDimension;
+ pSupportb += 3*S->vectorDimension;
+ pSupportc += 3*S->vectorDimension;
+ pSupportd += 3*S->vectorDimension;
+
+ vectorBlkCnt -- ;
+ }
+
+ pSupport = pSupporta;
+ vectorBlkCnt = S->nbOfSupportVectors & 3;
+ while (vectorBlkCnt > 0U)
+ {
+ accum = vdupq_n_f32(0);
+ dot = 0.0f;
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2 = vld1q_f32(pSupport);
+ pIn += 4;
+ pSupport += 4;
+
+ accum = vmlaq_f32(accum, vec1,vec2);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dot = dot + *pIn++ * *pSupport++;
+
+ blkCnt -- ;
+ }
+
+ sum += *pDualCoefs++ * dot;
+ vectorBlkCnt -- ;
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_linear_predict_f32(
+ const arm_svm_linear_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum=S->intercept;
+ float32_t dot=0;
+ uint32_t i,j;
+ const float32_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + in[j]* *pSupport++;
+ }
+ sum += S->dualCoefficients[i] * dot;
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of linearsvm group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c
new file mode 100644
index 0000000..558ab45
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_polynomial_init_f16.c
+ * Description: SVM Polynomial Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup polysvm Polynomial SVM
+
+ Polynomial SVM classifier
+ */
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the polynomial SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] degree Polynomial degree
+ * @param[in] coef0 coeff0 (scikit-learn terminology)
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f16(arm_svm_polynomial_instance_f16 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float16_t intercept,
+ const float16_t *dualCoefficients,
+ const float16_t *supportVectors,
+ const int32_t *classes,
+ int32_t degree,
+ float16_t coef0,
+ float16_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->degree = degree;
+ S->coef0 = coef0;
+ S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of polysvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
new file mode 100644
index 0000000..7d33cd7
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_polynomial_init_f32.c
+ * Description: SVM Polynomial Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup polysvm Polynomial SVM
+
+ Polynomial SVM classifier
+ */
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the polynomial SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] degree Polynomial degree
+ * @param[in] coef0 coeff0 (scikit-learn terminology)
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float32_t intercept,
+ const float32_t *dualCoefficients,
+ const float32_t *supportVectors,
+ const int32_t *classes,
+ int32_t degree,
+ float32_t coef0,
+ float32_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->degree = degree;
+ S->coef0 = coef0;
+ S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of polysvm group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
new file mode 100644
index 0000000..724f286
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
@@ -0,0 +1,369 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_polynomial_predict_f16.c
+ * Description: SVM Polynomial Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
+
+/*
+
+_Float16 is not supported in g++ so we avoid putting _Float16 definitions
+in the public headers.
+
+This function should at some point be moved in FastMath.
+
+*/
+__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
+{
+ float16_t r = x;
+ nb --;
+ while(nb > 0)
+ {
+ r = (_Float16)r * (_Float16)x;
+ nb--;
+ }
+ return(r);
+}
+#endif
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in] S Pointer to an instance of the polynomial SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+void arm_svm_polynomial_predict_f16(
+ const arm_svm_polynomial_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float16_t *pSupport = S->supportVectors;
+ const float16_t *pSrcA = pSupport;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float16_t *pDualCoef = S->dualCoefficients;
+ _Float16 sum = S->intercept;
+ f16x8_t vSum = vdupq_n_f16(0.0f);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parrallel
+ */
+ while (row >= 4) {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ acc2 = vdupq_n_f16(0.0f);
+ acc3 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+ vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f16
+ (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0),
+ S->degree),vctp16q(4));
+
+ pDualCoef += 4;
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parrallel
+ */
+ if (row >= 2) {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+ vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f16
+ (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
+ vctp16q(2));
+
+ pDualCoef += 2;
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f16
+ (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
+ vctp16q(1));
+ }
+ sum += (_Float16)vecAddAcrossF16Mve(vSum);
+
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in] S Pointer to an instance of the polynomial SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+void arm_svm_polynomial_predict_f16(
+ const arm_svm_polynomial_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ _Float16 sum=S->intercept;
+ _Float16 dot=0;
+ uint32_t i,j;
+ const float16_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++;
+ }
+ sum += (_Float16)S->dualCoefficients[i] * (_Float16)arm_exponent_f16((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0, S->degree);
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of polysvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
new file mode 100644
index 0000000..13b1f84
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
@@ -0,0 +1,490 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_polynomial_predict_f32.c
+ * Description: SVM Polynomial Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_vec_math.h"
+#endif
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in] S Pointer to an instance of the polynomial SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_polynomial_predict_f32(
+ const arm_svm_polynomial_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float32_t *pSupport = S->supportVectors;
+ const float32_t *pSrcA = pSupport;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float32_t *pDualCoef = S->dualCoefficients;
+ float32_t sum = S->intercept;
+ f32x4_t vSum = vdupq_n_f32(0.0f);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parrallel
+ */
+ while (row >= 4) {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+ vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f32
+ (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree));
+
+ pDualCoef += 4;
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parrallel
+ */
+ if (row >= 2) {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+ vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f32
+ (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
+ vctp32q(2));
+
+ pDualCoef += 2;
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f32
+ (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
+ vctp32q(1));
+ }
+ sum += vecAddAcrossF32Mve(vSum);
+
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+void arm_svm_polynomial_predict_f32(
+ const arm_svm_polynomial_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum = S->intercept;
+
+ float32_t dot;
+ float32x4_t dotV;
+
+ float32x4_t accuma,accumb,accumc,accumd,accum;
+ float32x2_t accum2;
+ float32x4_t vec1;
+ float32x4_t coef0 = vdupq_n_f32(S->coef0);
+
+ float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+ uint32_t blkCnt;
+ uint32_t vectorBlkCnt;
+
+ const float32_t *pIn = in;
+
+ const float32_t *pSupport = S->supportVectors;
+
+ const float32_t *pSupporta = S->supportVectors;
+ const float32_t *pSupportb;
+ const float32_t *pSupportc;
+ const float32_t *pSupportd;
+
+ pSupportb = pSupporta + S->vectorDimension;
+ pSupportc = pSupportb + S->vectorDimension;
+ pSupportd = pSupportc + S->vectorDimension;
+
+ const float32_t *pDualCoefs = S->dualCoefficients;
+
+ vectorBlkCnt = S->nbOfSupportVectors >> 2;
+ while (vectorBlkCnt > 0U)
+ {
+ accuma = vdupq_n_f32(0);
+ accumb = vdupq_n_f32(0);
+ accumc = vdupq_n_f32(0);
+ accumd = vdupq_n_f32(0);
+
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2a = vld1q_f32(pSupporta);
+ vec2b = vld1q_f32(pSupportb);
+ vec2c = vld1q_f32(pSupportc);
+ vec2d = vld1q_f32(pSupportd);
+
+ pIn += 4;
+ pSupporta += 4;
+ pSupportb += 4;
+ pSupportc += 4;
+ pSupportd += 4;
+
+ accuma = vmlaq_f32(accuma, vec1,vec2a);
+ accumb = vmlaq_f32(accumb, vec1,vec2b);
+ accumc = vmlaq_f32(accumc, vec1,vec2c);
+ accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+ accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+ accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+ accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+ pIn++;
+
+ blkCnt -- ;
+ }
+
+ vec1 = vld1q_f32(pDualCoefs);
+ pDualCoefs += 4;
+
+ // To vectorize later
+ dotV = vmulq_n_f32(dotV, S->gamma);
+ dotV = vaddq_f32(dotV, coef0);
+
+ dotV = arm_vec_exponent_f32(dotV,S->degree);
+
+ accum = vmulq_f32(vec1,dotV);
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+ pSupporta += 3*S->vectorDimension;
+ pSupportb += 3*S->vectorDimension;
+ pSupportc += 3*S->vectorDimension;
+ pSupportd += 3*S->vectorDimension;
+
+ vectorBlkCnt -- ;
+ }
+
+ pSupport = pSupporta;
+ vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+ while (vectorBlkCnt > 0U)
+ {
+ accum = vdupq_n_f32(0);
+ dot = 0.0f;
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2 = vld1q_f32(pSupport);
+ pIn += 4;
+ pSupport += 4;
+
+ accum = vmlaq_f32(accum, vec1,vec2);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dot = dot + *pIn++ * *pSupport++;
+
+ blkCnt -- ;
+ }
+
+ sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
+ vectorBlkCnt -- ;
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_polynomial_predict_f32(
+ const arm_svm_polynomial_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum=S->intercept;
+ float32_t dot=0;
+ uint32_t i,j;
+ const float32_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + in[j]* *pSupport++;
+ }
+ sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of polysvm group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c
new file mode 100644
index 0000000..43de249
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_rbf_init_f16.c
+ * Description: SVM Radial Basis Function Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup rbfsvm RBF SVM
+
+ RBF SVM classifier
+ */
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM radial basis function instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the polynomial SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f16(arm_svm_rbf_instance_f16 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float16_t intercept,
+ const float16_t *dualCoefficients,
+ const float16_t *supportVectors,
+ const int32_t *classes,
+ float16_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of rbfsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
new file mode 100644
index 0000000..77bf282
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
@@ -0,0 +1,91 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_rbf_init_f32.c
+ * Description: SVM Radial Basis Function Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup rbfsvm RBF SVM
+
+ RBF SVM classifier
+ */
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM radial basis function instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the polynomial SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float32_t intercept,
+ const float32_t *dualCoefficients,
+ const float32_t *supportVectors,
+ const int32_t *classes,
+ float32_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of rbfsvm group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
new file mode 100644
index 0000000..91afcc1
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
@@ -0,0 +1,352 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_rbf_predict_f16.c
+ * Description: SVM Radial Basis Function Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM rbf prediction
+ * @param[in] S Pointer to an instance of the rbf SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_rbf_predict_f16(
+ const arm_svm_rbf_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float16_t *pSupport = S->supportVectors;
+ const float16_t *pSrcA = pSupport;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float16_t *pDualCoef = S->dualCoefficients;
+ _Float16 sum = S->intercept;
+ f16x8_t vSum = vdupq_n_f16(0.0f16);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parrallel
+ */
+ while (row >= 4) {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f16);
+ acc1 = vdupq_n_f16(0.0f16);
+ acc2 = vdupq_n_f16(0.0f16);
+ acc3 = vdupq_n_f16(0.0f16);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+ f16x8_t vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc2 = vfmaq(acc2, vecDif, vecDif);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc3 = vfmaq(acc3, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+ f16x8_t vecDif;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ vecA = vldrhq_z_f16(pSrcA2Vec, p0);;
+ vecDif = vsubq(vecIn, vecA);
+ acc2 = vfmaq(acc2, vecDif, vecDif);
+ vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc3 = vfmaq(acc3, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+
+ //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF16Mve(acc0));
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)),vctp16q(4));
+ pDualCoef += 4;
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parrallel
+ */
+ if (row >= 2) {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f16);
+ acc1 = vdupq_n_f16(0.0f16);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+ f16x8_t vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);;
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA, vecDif;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(2));
+ pDualCoef += 2;
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA, vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA, vecDif;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(1));
+
+ }
+
+
+ sum += (_Float16)vecAddAcrossF16Mve(vSum);
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_rbf_predict_f16(
+ const arm_svm_rbf_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ _Float16 sum=S->intercept;
+ _Float16 dot=00.f16;
+ uint32_t i,j;
+ const float16_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0.0f16;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + SQ((_Float16)in[j] - (_Float16) *pSupport);
+ pSupport++;
+ }
+ sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf((float32_t)(-(_Float16)S->gamma * (_Float16)dot));
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of rbfsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
new file mode 100644
index 0000000..d3c43bf
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
@@ -0,0 +1,523 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_rbf_predict_f32.c
+ * Description: SVM Radial Basis Function Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM rbf prediction
+ * @param[in] S Pointer to an instance of the rbf SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_rbf_predict_f32(
+ const arm_svm_rbf_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float32_t *pSupport = S->supportVectors;
+ const float32_t *pSrcA = pSupport;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float32_t *pDualCoef = S->dualCoefficients;
+ float32_t sum = S->intercept;
+ f32x4_t vSum = vdupq_n_f32(0);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parrallel
+ */
+ while (row >= 4) {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+ f32x4_t vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc2 = vfmaq(acc2, vecDif, vecDif);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc3 = vfmaq(acc3, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+ f32x4_t vecDif;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ vecA = vldrwq_z_f32(pSrcA2Vec, p0);;
+ vecDif = vsubq(vecIn, vecA);
+ acc2 = vfmaq(acc2, vecDif, vecDif);
+ vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc3 = vfmaq(acc3, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+
+ //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF32Mve(acc0));
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+ vSum =
+ vfmaq_f32(vSum, vld1q(pDualCoef),
+ vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)));
+ pDualCoef += 4;
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parrallel
+ */
+ if (row >= 2) {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+ f32x4_t vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);;
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA, vecDif;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+ vSum =
+ vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(2));
+ pDualCoef += 2;
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA, vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA, vecDif;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+
+ vSum =
+ vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(1));
+
+ }
+
+
+ sum += vecAddAcrossF32Mve(vSum);
+ *pResult = S->classes[STEP(sum)];
+}
+
+
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+
+void arm_svm_rbf_predict_f32(
+ const arm_svm_rbf_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum = S->intercept;
+
+ float32_t dot;
+ float32x4_t dotV;
+
+ float32x4_t accuma,accumb,accumc,accumd,accum;
+ float32x2_t accum2;
+ float32x4_t temp;
+ float32x4_t vec1;
+
+ float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+ uint32_t blkCnt;
+ uint32_t vectorBlkCnt;
+
+ const float32_t *pIn = in;
+
+ const float32_t *pSupport = S->supportVectors;
+
+ const float32_t *pSupporta = S->supportVectors;
+ const float32_t *pSupportb;
+ const float32_t *pSupportc;
+ const float32_t *pSupportd;
+
+ pSupportb = pSupporta + S->vectorDimension;
+ pSupportc = pSupportb + S->vectorDimension;
+ pSupportd = pSupportc + S->vectorDimension;
+
+ const float32_t *pDualCoefs = S->dualCoefficients;
+
+
+ vectorBlkCnt = S->nbOfSupportVectors >> 2;
+ while (vectorBlkCnt > 0U)
+ {
+ accuma = vdupq_n_f32(0);
+ accumb = vdupq_n_f32(0);
+ accumc = vdupq_n_f32(0);
+ accumd = vdupq_n_f32(0);
+
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2a = vld1q_f32(pSupporta);
+ vec2b = vld1q_f32(pSupportb);
+ vec2c = vld1q_f32(pSupportc);
+ vec2d = vld1q_f32(pSupportd);
+
+ pIn += 4;
+ pSupporta += 4;
+ pSupportb += 4;
+ pSupportc += 4;
+ pSupportd += 4;
+
+ temp = vsubq_f32(vec1, vec2a);
+ accuma = vmlaq_f32(accuma, temp, temp);
+
+ temp = vsubq_f32(vec1, vec2b);
+ accumb = vmlaq_f32(accumb, temp, temp);
+
+ temp = vsubq_f32(vec1, vec2c);
+ accumc = vmlaq_f32(accumc, temp, temp);
+
+ temp = vsubq_f32(vec1, vec2d);
+ accumd = vmlaq_f32(accumd, temp, temp);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+ accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+ accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+ accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + SQ(*pIn - *pSupporta), dotV,0);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + SQ(*pIn - *pSupportb), dotV,1);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + SQ(*pIn - *pSupportc), dotV,2);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + SQ(*pIn - *pSupportd), dotV,3);
+
+ pSupporta++;
+ pSupportb++;
+ pSupportc++;
+ pSupportd++;
+
+ pIn++;
+
+ blkCnt -- ;
+ }
+
+ vec1 = vld1q_f32(pDualCoefs);
+ pDualCoefs += 4;
+
+ // To vectorize later
+ dotV = vmulq_n_f32(dotV, -S->gamma);
+ dotV = vexpq_f32(dotV);
+
+ accum = vmulq_f32(vec1,dotV);
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+ pSupporta += 3*S->vectorDimension;
+ pSupportb += 3*S->vectorDimension;
+ pSupportc += 3*S->vectorDimension;
+ pSupportd += 3*S->vectorDimension;
+
+ vectorBlkCnt -- ;
+ }
+
+ pSupport = pSupporta;
+ vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+ while (vectorBlkCnt > 0U)
+ {
+ accum = vdupq_n_f32(0);
+ dot = 0.0f;
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2 = vld1q_f32(pSupport);
+ pIn += 4;
+ pSupport += 4;
+
+ temp = vsubq_f32(vec1,vec2);
+ accum = vmlaq_f32(accum, temp,temp);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+
+ dot = dot + SQ(*pIn - *pSupport);
+ pIn++;
+ pSupport++;
+
+ blkCnt -- ;
+ }
+
+ sum += *pDualCoefs++ * expf(-S->gamma * dot);
+ vectorBlkCnt -- ;
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_rbf_predict_f32(
+ const arm_svm_rbf_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum=S->intercept;
+ float32_t dot=0;
+ uint32_t i,j;
+ const float32_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + SQ(in[j] - *pSupport);
+ pSupport++;
+ }
+ sum += S->dualCoefficients[i] * expf(-S->gamma * dot);
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+#endif
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of rbfsvm group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c
new file mode 100644
index 0000000..7a27417
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c
@@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_sigmoid_predict_f16.c
+ * Description: SVM Sigmoid Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup sigmoidsvm Sigmoid SVM
+
+ Sigmoid SVM classifier
+ */
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM sigmoid instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the rbf SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] coef0 coeff0 (scikit-learn terminology)
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f16(arm_svm_sigmoid_instance_f16 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float16_t intercept,
+ const float16_t *dualCoefficients,
+ const float16_t *supportVectors,
+ const int32_t *classes,
+ float16_t coef0,
+ float16_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->coef0 = coef0;
+ S->gamma = gamma;
+}
+
+
+/**
+ * @} end of sigmoidsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
new file mode 100644
index 0000000..a7f16c2
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_sigmoid_predict_f32.c
+ * Description: SVM Sigmoid Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup sigmoidsvm Sigmoid SVM
+
+ Sigmoid SVM classifier
+ */
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM sigmoid instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the rbf SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] coef0 coeff0 (scikit-learn terminology)
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float32_t intercept,
+ const float32_t *dualCoefficients,
+ const float32_t *supportVectors,
+ const int32_t *classes,
+ float32_t coef0,
+ float32_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->coef0 = coef0;
+ S->gamma = gamma;
+}
+
+
+/**
+ * @} end of sigmoidsvm group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
new file mode 100644
index 0000000..e2d541f
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
@@ -0,0 +1,333 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_sigmoid_predict_f16.c
+ * Description: SVM Sigmoid Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+
+/**
+ * @brief SVM sigmoid prediction
+ * @param[in] S Pointer to an instance of the rbf SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_sigmoid_predict_f16(
+ const arm_svm_sigmoid_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float16_t *pSupport = S->supportVectors;
+ const float16_t *pSrcA = pSupport;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float16_t *pDualCoef = S->dualCoefficients;
+ _Float16 sum = S->intercept;
+ f16x8_t vSum = vdupq_n_f16(0.0f);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parrallel
+ */
+ while (row >= 4) {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ acc2 = vdupq_n_f16(0.0f);
+ acc3 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),vctp16q(4));
+
+ pDualCoef += 4;
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parrallel
+ */
+ if (row >= 2) {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
+ vctp16q(2));
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
+ vctp16q(1));
+ }
+ sum += (_Float16)vecAddAcrossF16Mve(vSum);
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_sigmoid_predict_f16(
+ const arm_svm_sigmoid_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ _Float16 sum=S->intercept;
+ _Float16 dot=0.0f16;
+ uint32_t i,j;
+ const float16_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0.0f16;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = (_Float16)dot + (_Float16)in[j] * (_Float16)*pSupport++;
+ }
+ sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((float32_t)((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0));
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of sigmoidsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
new file mode 100644
index 0000000..37c8a08
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
@@ -0,0 +1,487 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_sigmoid_predict_f32.c
+ * Description: SVM Sigmoid Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+
+/**
+ * @brief SVM sigmoid prediction
+ * @param[in] S Pointer to an instance of the rbf SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_sigmoid_predict_f32(
+ const arm_svm_sigmoid_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float32_t *pSupport = S->supportVectors;
+ const float32_t *pSrcA = pSupport;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float32_t *pDualCoef = S->dualCoefficients;
+ float32_t sum = S->intercept;
+ f32x4_t vSum = vdupq_n_f32(0.0f);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parrallel
+ */
+ while (row >= 4) {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+ vSum =
+ vfmaq_f32(vSum, vld1q(pDualCoef),
+ vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)));
+
+ pDualCoef += 4;
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parrallel
+ */
+ if (row >= 2) {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+ vSum =
+ vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
+ vctp32q(2));
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+
+ vSum =
+ vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
+ vctp32q(1));
+ }
+ sum += vecAddAcrossF32Mve(vSum);
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+#include "NEMath.h"
+
+void arm_svm_sigmoid_predict_f32(
+ const arm_svm_sigmoid_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum = S->intercept;
+
+ float32_t dot;
+ float32x4_t dotV;
+
+ float32x4_t accuma,accumb,accumc,accumd,accum;
+ float32x2_t accum2;
+ float32x4_t vec1;
+ float32x4_t coef0 = vdupq_n_f32(S->coef0);
+
+ float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+ uint32_t blkCnt;
+ uint32_t vectorBlkCnt;
+
+ const float32_t *pIn = in;
+
+ const float32_t *pSupport = S->supportVectors;
+
+ const float32_t *pSupporta = S->supportVectors;
+ const float32_t *pSupportb;
+ const float32_t *pSupportc;
+ const float32_t *pSupportd;
+
+ pSupportb = pSupporta + S->vectorDimension;
+ pSupportc = pSupportb + S->vectorDimension;
+ pSupportd = pSupportc + S->vectorDimension;
+
+ const float32_t *pDualCoefs = S->dualCoefficients;
+
+ vectorBlkCnt = S->nbOfSupportVectors >> 2;
+ while (vectorBlkCnt > 0U)
+ {
+ accuma = vdupq_n_f32(0);
+ accumb = vdupq_n_f32(0);
+ accumc = vdupq_n_f32(0);
+ accumd = vdupq_n_f32(0);
+
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2a = vld1q_f32(pSupporta);
+ vec2b = vld1q_f32(pSupportb);
+ vec2c = vld1q_f32(pSupportc);
+ vec2d = vld1q_f32(pSupportd);
+
+ pIn += 4;
+ pSupporta += 4;
+ pSupportb += 4;
+ pSupportc += 4;
+ pSupportd += 4;
+
+ accuma = vmlaq_f32(accuma, vec1,vec2a);
+ accumb = vmlaq_f32(accumb, vec1,vec2b);
+ accumc = vmlaq_f32(accumc, vec1,vec2c);
+ accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+ accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+ accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+ accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+ pIn++;
+
+ blkCnt -- ;
+ }
+
+ vec1 = vld1q_f32(pDualCoefs);
+ pDualCoefs += 4;
+
+ // To vectorize later
+ dotV = vmulq_n_f32(dotV, S->gamma);
+ dotV = vaddq_f32(dotV, coef0);
+
+ dotV = vtanhq_f32(dotV);
+
+ accum = vmulq_f32(vec1,dotV);
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+ pSupporta += 3*S->vectorDimension;
+ pSupportb += 3*S->vectorDimension;
+ pSupportc += 3*S->vectorDimension;
+ pSupportd += 3*S->vectorDimension;
+
+ vectorBlkCnt -- ;
+ }
+
+ pSupport = pSupporta;
+ vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+ while (vectorBlkCnt > 0U)
+ {
+ accum = vdupq_n_f32(0);
+ dot = 0.0f;
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2 = vld1q_f32(pSupport);
+ pIn += 4;
+ pSupport += 4;
+
+ accum = vmlaq_f32(accum, vec1,vec2);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dot = dot + *pIn++ * *pSupport++;
+
+ blkCnt -- ;
+ }
+
+ sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0);
+ vectorBlkCnt -- ;
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_sigmoid_predict_f32(
+ const arm_svm_sigmoid_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum=S->intercept;
+ float32_t dot=0;
+ uint32_t i,j;
+ const float32_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + in[j]* *pSupport++;
+ }
+ sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of sigmoidsvm group
+ */