From 5b81bc8ccbd342b8566d88fc9f17a73aec03b5b6 Mon Sep 17 00:00:00 2001 From: Clyne Sullivan Date: Wed, 29 Jan 2025 21:34:25 -0500 Subject: initial commit --- .../CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt | 41 ++ .../CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c | 36 ++ .../DSP/Source/SVMFunctions/SVMFunctionsF16.c | 36 ++ .../Source/SVMFunctions/arm_svm_linear_init_f16.c | 98 ++++ .../Source/SVMFunctions/arm_svm_linear_init_f32.c | 92 ++++ .../SVMFunctions/arm_svm_linear_predict_f16.c | 314 +++++++++++++ .../SVMFunctions/arm_svm_linear_predict_f32.c | 461 ++++++++++++++++++ .../SVMFunctions/arm_svm_polynomial_init_f16.c | 103 ++++ .../SVMFunctions/arm_svm_polynomial_init_f32.c | 97 ++++ .../SVMFunctions/arm_svm_polynomial_predict_f16.c | 369 +++++++++++++++ .../SVMFunctions/arm_svm_polynomial_predict_f32.c | 490 +++++++++++++++++++ .../DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c | 97 ++++ .../DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c | 91 ++++ .../Source/SVMFunctions/arm_svm_rbf_predict_f16.c | 352 ++++++++++++++ .../Source/SVMFunctions/arm_svm_rbf_predict_f32.c | 523 +++++++++++++++++++++ .../Source/SVMFunctions/arm_svm_sigmoid_init_f16.c | 98 ++++ .../Source/SVMFunctions/arm_svm_sigmoid_init_f32.c | 92 ++++ .../SVMFunctions/arm_svm_sigmoid_predict_f16.c | 333 +++++++++++++ .../SVMFunctions/arm_svm_sigmoid_predict_f32.c | 487 +++++++++++++++++++ 19 files changed, 4210 insertions(+) create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c create mode 100644 Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c (limited to 'Drivers/CMSIS/DSP/Source/SVMFunctions') diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt b/Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt new file mode 100644 index 0000000..eb1e0ca --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt @@ -0,0 +1,41 @@ +cmake_minimum_required (VERSION 3.14) + +project(CMSISDSPSVM) + +include(configLib) +include(configDsp) + + +add_library(CMSISDSPSVM STATIC) + +target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_init_f32.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_init_f32.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_predict_f32.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_predict_f32.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_init_f32.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_init_f32.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_predict_f32.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_predict_f32.c) + + + +configLib(CMSISDSPSVM ${ROOT}) +configDsp(CMSISDSPSVM ${ROOT}) + +### Includes +target_include_directories(CMSISDSPSVM PUBLIC "${DSP}/Include") + +if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16)) +target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_init_f16.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_init_f16.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_predict_f16.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_predict_f16.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_init_f16.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_init_f16.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_predict_f16.c) +target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_predict_f16.c) +endif() + + + + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c new file mode 100644 index 0000000..85f19b9 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: BayesFunctions.c + * Description: Combination of all SVM function source files. + * + * $Date: 16. March 2020 + * $Revision: V1.0.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_svm_linear_init_f32.c" +#include "arm_svm_linear_predict_f32.c" +#include "arm_svm_polynomial_init_f32.c" +#include "arm_svm_polynomial_predict_f32.c" +#include "arm_svm_rbf_init_f32.c" +#include "arm_svm_rbf_predict_f32.c" +#include "arm_svm_sigmoid_init_f32.c" +#include "arm_svm_sigmoid_predict_f32.c" diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c new file mode 100644 index 0000000..74d2665 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: BayesFunctions.c + * Description: Combination of all SVM function source files. + * + * $Date: 16. March 2020 + * $Revision: V1.0.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_svm_linear_init_f16.c" +#include "arm_svm_linear_predict_f16.c" +#include "arm_svm_polynomial_init_f16.c" +#include "arm_svm_polynomial_predict_f16.c" +#include "arm_svm_rbf_init_f16.c" +#include "arm_svm_rbf_predict_f16.c" +#include "arm_svm_sigmoid_init_f16.c" +#include "arm_svm_sigmoid_predict_f16.c" diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c new file mode 100644 index 0000000..a3ebc7f --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c @@ -0,0 +1,98 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_linear_init_f16.c + * Description: SVM Linear Instance Initialization + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include +#include + +/** + * @defgroup groupSVM SVM Functions + * + */ + +/** + @ingroup groupSVM + */ + +/** + @defgroup linearsvm Linear SVM + + Linear SVM classifier + */ + +/** + * @addtogroup linearsvm + * @{ + */ + + +/** + * @brief SVM linear instance init function + * + * Classes are integer used as output of the function (instead of having -1,1 + * as class values). + * + * @param[in] S Parameters for the SVM function + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @return none. + * + */ + + +void arm_svm_linear_init_f16(arm_svm_linear_instance_f16 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float16_t intercept, + const float16_t *dualCoefficients, + const float16_t *supportVectors, + const int32_t *classes) +{ + S->nbOfSupportVectors = nbOfSupportVectors; + S->vectorDimension = vectorDimension; + S->intercept = intercept; + S->dualCoefficients = dualCoefficients; + S->supportVectors = supportVectors; + S->classes = classes; +} + + + +/** + * @} end of linearsvm group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c new file mode 100644 index 0000000..75395aa --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c @@ -0,0 +1,92 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_linear_init_f32.c + * Description: SVM Linear Instance Initialization + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions.h" +#include +#include + +/** + * @defgroup groupSVM SVM Functions + * + */ + +/** + @ingroup groupSVM + */ + +/** + @defgroup linearsvm Linear SVM + + Linear SVM classifier + */ + +/** + * @addtogroup linearsvm + * @{ + */ + + +/** + * @brief SVM linear instance init function + * + * Classes are integer used as output of the function (instead of having -1,1 + * as class values). + * + * @param[in] S Parameters for the SVM function + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @return none. + * + */ + + +void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float32_t intercept, + const float32_t *dualCoefficients, + const float32_t *supportVectors, + const int32_t *classes) +{ + S->nbOfSupportVectors = nbOfSupportVectors; + S->vectorDimension = vectorDimension; + S->intercept = intercept; + S->dualCoefficients = dualCoefficients; + S->supportVectors = supportVectors; + S->classes = classes; +} + + + +/** + * @} end of linearsvm group + */ diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c new file mode 100644 index 0000000..fe907e5 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c @@ -0,0 +1,314 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_linear_predict_f16.c + * Description: SVM Linear Classifier + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include +#include + + +/** + * @addtogroup linearsvm + * @{ + */ + + +/** + * @brief SVM linear prediction + * @param[in] S Pointer to an instance of the linear SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" + +void arm_svm_linear_predict_f16( + const arm_svm_linear_instance_f16 *S, + const float16_t * in, + int32_t * pResult) +{ + /* inlined Matrix x Vector function interleaved with dot prod */ + uint32_t numRows = S->nbOfSupportVectors; + uint32_t numCols = S->vectorDimension; + const float16_t *pSupport = S->supportVectors; + const float16_t *pSrcA = pSupport; + const float16_t *pInA0; + const float16_t *pInA1; + uint32_t row; + uint32_t blkCnt; /* loop counters */ + const float16_t *pDualCoef = S->dualCoefficients; + _Float16 sum = S->intercept; + row = numRows; + + /* + * compute 4 rows in parrallel + */ + while (row >= 4) + { + const float16_t *pInA2, *pInA3; + float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; + f16x8_t vecIn, acc0, acc1, acc2, acc3; + float16_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 4 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + pInA2 = pInA1 + numCols; + pInA3 = pInA2 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + acc1 = vdupq_n_f16(0.0f); + acc2 = vdupq_n_f16(0.0f); + acc3 = vdupq_n_f16(0.0f); + + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + pSrcA2Vec = pInA2; + pSrcA3Vec = pInA3; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 8; + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vld1q(pSrcA2Vec); + pSrcA2Vec += 8; + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vld1q(pSrcA3Vec); + pSrcA3Vec += 8; + acc3 = vfmaq(acc3, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA2Vec, p0); + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA3Vec, p0); + acc3 = vfmaq(acc3, vecIn, vecA); + } + /* + * Sum the partial parts + */ + acc0 = vmulq_n_f16(acc0,*pDualCoef++); + acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++); + acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++); + acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++); + + sum += (_Float16)vecAddAcrossF16Mve(acc0); + + pSrcA += numCols * 4; + /* + * Decrement the row loop counter + */ + row -= 4; + } + + /* + * compute 2 rows in parallel + */ + if (row >= 2) { + float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; + f16x8_t vecIn, acc0, acc1; + float16_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 2 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + acc1 = vdupq_n_f16(0.0f); + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 8; + acc1 = vfmaq(acc1, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + } + /* + * Sum the partial parts + */ + acc0 = vmulq_n_f16(acc0,*pDualCoef++); + acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++); + + sum += (_Float16)vecAddAcrossF16Mve(acc0); + + pSrcA += numCols * 2; + row -= 2; + } + + if (row >= 1) { + f16x8_t vecIn, acc0; + float16_t const *pSrcA0Vec, *pInVec; + float16_t const *pSrcVecPtr = in; + /* + * Initialize the pointers to last MatrixA row + */ + pInA0 = pSrcA; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + + pSrcA0Vec = pInA0; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + } + /* + * Sum the partial parts + */ + sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0); + + } + + *pResult = S->classes[STEP(sum)]; +} + +#else +void arm_svm_linear_predict_f16( + const arm_svm_linear_instance_f16 *S, + const float16_t * in, + int32_t * pResult) +{ + _Float16 sum=S->intercept; + _Float16 dot=0; + uint32_t i,j; + const float16_t *pSupport = S->supportVectors; + + for(i=0; i < S->nbOfSupportVectors; i++) + { + dot=0; + for(j=0; j < S->vectorDimension; j++) + { + dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++; + } + sum += (_Float16)S->dualCoefficients[i] * (_Float16)dot; + } + *pResult=S->classes[STEP(sum)]; +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of linearsvm group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c new file mode 100644 index 0000000..caf09df --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c @@ -0,0 +1,461 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_linear_predict_f32.c + * Description: SVM Linear Classifier + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions.h" +#include +#include + + +/** + * @addtogroup linearsvm + * @{ + */ + + +/** + * @brief SVM linear prediction + * @param[in] S Pointer to an instance of the linear SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" + +void arm_svm_linear_predict_f32( + const arm_svm_linear_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + /* inlined Matrix x Vector function interleaved with dot prod */ + uint32_t numRows = S->nbOfSupportVectors; + uint32_t numCols = S->vectorDimension; + const float32_t *pSupport = S->supportVectors; + const float32_t *pSrcA = pSupport; + const float32_t *pInA0; + const float32_t *pInA1; + uint32_t row; + uint32_t blkCnt; /* loop counters */ + const float32_t *pDualCoef = S->dualCoefficients; + float32_t sum = S->intercept; + row = numRows; + + /* + * compute 4 rows in parrallel + */ + while (row >= 4) + { + const float32_t *pInA2, *pInA3; + float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; + f32x4_t vecIn, acc0, acc1, acc2, acc3; + float32_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 4 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + pInA2 = pInA1 + numCols; + pInA3 = pInA2 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + acc1 = vdupq_n_f32(0.0f); + acc2 = vdupq_n_f32(0.0f); + acc3 = vdupq_n_f32(0.0f); + + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + pSrcA2Vec = pInA2; + pSrcA3Vec = pInA3; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 4; + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vld1q(pSrcA2Vec); + pSrcA2Vec += 4; + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vld1q(pSrcA3Vec); + pSrcA3Vec += 4; + acc3 = vfmaq(acc3, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA2Vec, p0); + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA3Vec, p0); + acc3 = vfmaq(acc3, vecIn, vecA); + } + /* + * Sum the partial parts + */ + + acc0 = vmulq_n_f32(acc0,*pDualCoef++); + acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++); + acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++); + acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++); + + sum += vecAddAcrossF32Mve(acc0); + + pSrcA += numCols * 4; + /* + * Decrement the row loop counter + */ + row -= 4; + } + + /* + * compute 2 rows in parallel + */ + if (row >= 2) { + float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; + f32x4_t vecIn, acc0, acc1; + float32_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 2 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + acc1 = vdupq_n_f32(0.0f); + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 4; + acc1 = vfmaq(acc1, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + } + /* + * Sum the partial parts + */ + acc0 = vmulq_n_f32(acc0,*pDualCoef++); + acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++); + + sum += vecAddAcrossF32Mve(acc0); + + + pSrcA += numCols * 2; + row -= 2; + } + + if (row >= 1) { + f32x4_t vecIn, acc0; + float32_t const *pSrcA0Vec, *pInVec; + float32_t const *pSrcVecPtr = in; + /* + * Initialize the pointers to last MatrixA row + */ + pInA0 = pSrcA; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + + pSrcA0Vec = pInA0; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + } + /* + * Sum the partial parts + */ + sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0); + + } + + *pResult = S->classes[STEP(sum)]; +} + +#else +#if defined(ARM_MATH_NEON) +void arm_svm_linear_predict_f32( + const arm_svm_linear_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + float32_t sum = S->intercept; + + float32_t dot; + float32x4_t dotV; + + float32x4_t accuma,accumb,accumc,accumd,accum; + float32x2_t accum2; + float32x4_t vec1; + + float32x4_t vec2,vec2a,vec2b,vec2c,vec2d; + + uint32_t blkCnt; + uint32_t vectorBlkCnt; + + const float32_t *pIn = in; + + const float32_t *pSupport = S->supportVectors; + + const float32_t *pSupporta = S->supportVectors; + const float32_t *pSupportb; + const float32_t *pSupportc; + const float32_t *pSupportd; + + pSupportb = pSupporta + S->vectorDimension; + pSupportc = pSupportb + S->vectorDimension; + pSupportd = pSupportc + S->vectorDimension; + + const float32_t *pDualCoefs = S->dualCoefficients; + + vectorBlkCnt = S->nbOfSupportVectors >> 2; + + while (vectorBlkCnt > 0U) + { + accuma = vdupq_n_f32(0); + accumb = vdupq_n_f32(0); + accumc = vdupq_n_f32(0); + accumd = vdupq_n_f32(0); + + pIn = in; + + blkCnt = S->vectorDimension >> 2; + while (blkCnt > 0U) + { + + vec1 = vld1q_f32(pIn); + vec2a = vld1q_f32(pSupporta); + vec2b = vld1q_f32(pSupportb); + vec2c = vld1q_f32(pSupportc); + vec2d = vld1q_f32(pSupportd); + + pIn += 4; + pSupporta += 4; + pSupportb += 4; + pSupportc += 4; + pSupportd += 4; + + accuma = vmlaq_f32(accuma, vec1,vec2a); + accumb = vmlaq_f32(accumb, vec1,vec2b); + accumc = vmlaq_f32(accumc, vec1,vec2c); + accumd = vmlaq_f32(accumd, vec1,vec2d); + + blkCnt -- ; + } + accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); + + accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); + + accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); + + accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); + + + blkCnt = S->vectorDimension & 3; + while (blkCnt > 0U) + { + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3); + + pIn++; + + blkCnt -- ; + } + + vec1 = vld1q_f32(pDualCoefs); + pDualCoefs += 4; + + accum = vmulq_f32(vec1,dotV); + accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); + sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); + + pSupporta += 3*S->vectorDimension; + pSupportb += 3*S->vectorDimension; + pSupportc += 3*S->vectorDimension; + pSupportd += 3*S->vectorDimension; + + vectorBlkCnt -- ; + } + + pSupport = pSupporta; + vectorBlkCnt = S->nbOfSupportVectors & 3; + while (vectorBlkCnt > 0U) + { + accum = vdupq_n_f32(0); + dot = 0.0f; + pIn = in; + + blkCnt = S->vectorDimension >> 2; + while (blkCnt > 0U) + { + + vec1 = vld1q_f32(pIn); + vec2 = vld1q_f32(pSupport); + pIn += 4; + pSupport += 4; + + accum = vmlaq_f32(accum, vec1,vec2); + + blkCnt -- ; + } + accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); + dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); + + + blkCnt = S->vectorDimension & 3; + while (blkCnt > 0U) + { + dot = dot + *pIn++ * *pSupport++; + + blkCnt -- ; + } + + sum += *pDualCoefs++ * dot; + vectorBlkCnt -- ; + } + + *pResult=S->classes[STEP(sum)]; +} +#else +void arm_svm_linear_predict_f32( + const arm_svm_linear_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + float32_t sum=S->intercept; + float32_t dot=0; + uint32_t i,j; + const float32_t *pSupport = S->supportVectors; + + for(i=0; i < S->nbOfSupportVectors; i++) + { + dot=0; + for(j=0; j < S->vectorDimension; j++) + { + dot = dot + in[j]* *pSupport++; + } + sum += S->dualCoefficients[i] * dot; + } + *pResult=S->classes[STEP(sum)]; +} +#endif +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of linearsvm group + */ diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c new file mode 100644 index 0000000..558ab45 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c @@ -0,0 +1,103 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_polynomial_init_f16.c + * Description: SVM Polynomial Instance Initialization + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include +#include + +/** + @ingroup groupSVM + */ + +/** + @defgroup polysvm Polynomial SVM + + Polynomial SVM classifier + */ + +/** + * @addtogroup polysvm + * @{ + */ + + +/** + * @brief SVM polynomial instance init function + * + * Classes are integer used as output of the function (instead of having -1,1 + * as class values). + * + * @param[in] S points to an instance of the polynomial SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] degree Polynomial degree + * @param[in] coef0 coeff0 (scikit-learn terminology) + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + + +void arm_svm_polynomial_init_f16(arm_svm_polynomial_instance_f16 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float16_t intercept, + const float16_t *dualCoefficients, + const float16_t *supportVectors, + const int32_t *classes, + int32_t degree, + float16_t coef0, + float16_t gamma + ) +{ + S->nbOfSupportVectors = nbOfSupportVectors; + S->vectorDimension = vectorDimension; + S->intercept = intercept; + S->dualCoefficients = dualCoefficients; + S->supportVectors = supportVectors; + S->classes = classes; + S->degree = degree; + S->coef0 = coef0; + S->gamma = gamma; +} + + + +/** + * @} end of polysvm group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c new file mode 100644 index 0000000..7d33cd7 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c @@ -0,0 +1,97 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_polynomial_init_f32.c + * Description: SVM Polynomial Instance Initialization + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions.h" +#include +#include + +/** + @ingroup groupSVM + */ + +/** + @defgroup polysvm Polynomial SVM + + Polynomial SVM classifier + */ + +/** + * @addtogroup polysvm + * @{ + */ + + +/** + * @brief SVM polynomial instance init function + * + * Classes are integer used as output of the function (instead of having -1,1 + * as class values). + * + * @param[in] S points to an instance of the polynomial SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] degree Polynomial degree + * @param[in] coef0 coeff0 (scikit-learn terminology) + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + + +void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float32_t intercept, + const float32_t *dualCoefficients, + const float32_t *supportVectors, + const int32_t *classes, + int32_t degree, + float32_t coef0, + float32_t gamma + ) +{ + S->nbOfSupportVectors = nbOfSupportVectors; + S->vectorDimension = vectorDimension; + S->intercept = intercept; + S->dualCoefficients = dualCoefficients; + S->supportVectors = supportVectors; + S->classes = classes; + S->degree = degree; + S->coef0 = coef0; + S->gamma = gamma; +} + + + +/** + * @} end of polysvm group + */ diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c new file mode 100644 index 0000000..724f286 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c @@ -0,0 +1,369 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_polynomial_predict_f16.c + * Description: SVM Polynomial Classifier + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include +#include + +#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE) + +/* + +_Float16 is not supported in g++ so we avoid putting _Float16 definitions +in the public headers. + +This function should at some point be moved in FastMath. + +*/ +__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb) +{ + float16_t r = x; + nb --; + while(nb > 0) + { + r = (_Float16)r * (_Float16)x; + nb--; + } + return(r); +} +#endif + +/** + * @addtogroup polysvm + * @{ + */ + + + + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_math_f16.h" + +/** + * @brief SVM polynomial prediction + * @param[in] S Pointer to an instance of the polynomial SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ +void arm_svm_polynomial_predict_f16( + const arm_svm_polynomial_instance_f16 *S, + const float16_t * in, + int32_t * pResult) +{ + /* inlined Matrix x Vector function interleaved with dot prod */ + uint32_t numRows = S->nbOfSupportVectors; + uint32_t numCols = S->vectorDimension; + const float16_t *pSupport = S->supportVectors; + const float16_t *pSrcA = pSupport; + const float16_t *pInA0; + const float16_t *pInA1; + uint32_t row; + uint32_t blkCnt; /* loop counters */ + const float16_t *pDualCoef = S->dualCoefficients; + _Float16 sum = S->intercept; + f16x8_t vSum = vdupq_n_f16(0.0f); + + row = numRows; + + /* + * compute 4 rows in parrallel + */ + while (row >= 4) { + const float16_t *pInA2, *pInA3; + float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; + f16x8_t vecIn, acc0, acc1, acc2, acc3; + float16_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 4 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + pInA2 = pInA1 + numCols; + pInA3 = pInA2 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + acc1 = vdupq_n_f16(0.0f); + acc2 = vdupq_n_f16(0.0f); + acc3 = vdupq_n_f16(0.0f); + + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + pSrcA2Vec = pInA2; + pSrcA3Vec = pInA3; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 8; + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vld1q(pSrcA2Vec); + pSrcA2Vec += 8; + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vld1q(pSrcA3Vec); + pSrcA3Vec += 8; + acc3 = vfmaq(acc3, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA2Vec, p0); + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA3Vec, p0); + acc3 = vfmaq(acc3, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3); + + vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef), + arm_vec_exponent_f16 + (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), + S->degree),vctp16q(4)); + + pDualCoef += 4; + + pSrcA += numCols * 4; + /* + * Decrement the row loop counter + */ + row -= 4; + } + + /* + * compute 2 rows in parrallel + */ + if (row >= 2) { + float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; + f16x8_t vecIn, acc0, acc1; + float16_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 2 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + acc1 = vdupq_n_f16(0.0f); + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 8; + acc1 = vfmaq(acc1, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1); + + vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef), + arm_vec_exponent_f16 + (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), + vctp16q(2)); + + pDualCoef += 2; + pSrcA += numCols * 2; + row -= 2; + } + + if (row >= 1) { + f16x8_t vecIn, acc0; + float16_t const *pSrcA0Vec, *pInVec; + float16_t const *pSrcVecPtr = in; + /* + * Initialize the pointers to last MatrixA row + */ + pInA0 = pSrcA; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + + pSrcA0Vec = pInA0; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef), + arm_vec_exponent_f16 + (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), + vctp16q(1)); + } + sum += (_Float16)vecAddAcrossF16Mve(vSum); + + + *pResult = S->classes[STEP(sum)]; +} + +#else + + +/** + * @brief SVM polynomial prediction + * @param[in] S Pointer to an instance of the polynomial SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ +void arm_svm_polynomial_predict_f16( + const arm_svm_polynomial_instance_f16 *S, + const float16_t * in, + int32_t * pResult) +{ + _Float16 sum=S->intercept; + _Float16 dot=0; + uint32_t i,j; + const float16_t *pSupport = S->supportVectors; + + for(i=0; i < S->nbOfSupportVectors; i++) + { + dot=0; + for(j=0; j < S->vectorDimension; j++) + { + dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++; + } + sum += (_Float16)S->dualCoefficients[i] * (_Float16)arm_exponent_f16((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0, S->degree); + } + + *pResult=S->classes[STEP(sum)]; +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + + +/** + * @} end of polysvm group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c new file mode 100644 index 0000000..13b1f84 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c @@ -0,0 +1,490 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_polynomial_predict_f32.c + * Description: SVM Polynomial Classifier + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions.h" +#include +#include + +#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) +#include "arm_vec_math.h" +#endif + +/** + * @addtogroup polysvm + * @{ + */ + + +/** + * @brief SVM polynomial prediction + * @param[in] S Pointer to an instance of the polynomial SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_math.h" + +void arm_svm_polynomial_predict_f32( + const arm_svm_polynomial_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + /* inlined Matrix x Vector function interleaved with dot prod */ + uint32_t numRows = S->nbOfSupportVectors; + uint32_t numCols = S->vectorDimension; + const float32_t *pSupport = S->supportVectors; + const float32_t *pSrcA = pSupport; + const float32_t *pInA0; + const float32_t *pInA1; + uint32_t row; + uint32_t blkCnt; /* loop counters */ + const float32_t *pDualCoef = S->dualCoefficients; + float32_t sum = S->intercept; + f32x4_t vSum = vdupq_n_f32(0.0f); + + row = numRows; + + /* + * compute 4 rows in parrallel + */ + while (row >= 4) { + const float32_t *pInA2, *pInA3; + float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; + f32x4_t vecIn, acc0, acc1, acc2, acc3; + float32_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 4 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + pInA2 = pInA1 + numCols; + pInA3 = pInA2 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + acc1 = vdupq_n_f32(0.0f); + acc2 = vdupq_n_f32(0.0f); + acc3 = vdupq_n_f32(0.0f); + + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + pSrcA2Vec = pInA2; + pSrcA3Vec = pInA3; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 4; + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vld1q(pSrcA2Vec); + pSrcA2Vec += 4; + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vld1q(pSrcA3Vec); + pSrcA3Vec += 4; + acc3 = vfmaq(acc3, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA2Vec, p0); + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA3Vec, p0); + acc3 = vfmaq(acc3, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3); + + vSum = vfmaq_f32(vSum, vld1q(pDualCoef), + arm_vec_exponent_f32 + (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree)); + + pDualCoef += 4; + + pSrcA += numCols * 4; + /* + * Decrement the row loop counter + */ + row -= 4; + } + + /* + * compute 2 rows in parrallel + */ + if (row >= 2) { + float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; + f32x4_t vecIn, acc0, acc1; + float32_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 2 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + acc1 = vdupq_n_f32(0.0f); + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 4; + acc1 = vfmaq(acc1, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1); + + vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef), + arm_vec_exponent_f32 + (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree), + vctp32q(2)); + + pDualCoef += 2; + pSrcA += numCols * 2; + row -= 2; + } + + if (row >= 1) { + f32x4_t vecIn, acc0; + float32_t const *pSrcA0Vec, *pInVec; + float32_t const *pSrcVecPtr = in; + /* + * Initialize the pointers to last MatrixA row + */ + pInA0 = pSrcA; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + + pSrcA0Vec = pInA0; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef), + arm_vec_exponent_f32 + (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree), + vctp32q(1)); + } + sum += vecAddAcrossF32Mve(vSum); + + + *pResult = S->classes[STEP(sum)]; +} + +#else +#if defined(ARM_MATH_NEON) +void arm_svm_polynomial_predict_f32( + const arm_svm_polynomial_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + float32_t sum = S->intercept; + + float32_t dot; + float32x4_t dotV; + + float32x4_t accuma,accumb,accumc,accumd,accum; + float32x2_t accum2; + float32x4_t vec1; + float32x4_t coef0 = vdupq_n_f32(S->coef0); + + float32x4_t vec2,vec2a,vec2b,vec2c,vec2d; + + uint32_t blkCnt; + uint32_t vectorBlkCnt; + + const float32_t *pIn = in; + + const float32_t *pSupport = S->supportVectors; + + const float32_t *pSupporta = S->supportVectors; + const float32_t *pSupportb; + const float32_t *pSupportc; + const float32_t *pSupportd; + + pSupportb = pSupporta + S->vectorDimension; + pSupportc = pSupportb + S->vectorDimension; + pSupportd = pSupportc + S->vectorDimension; + + const float32_t *pDualCoefs = S->dualCoefficients; + + vectorBlkCnt = S->nbOfSupportVectors >> 2; + while (vectorBlkCnt > 0U) + { + accuma = vdupq_n_f32(0); + accumb = vdupq_n_f32(0); + accumc = vdupq_n_f32(0); + accumd = vdupq_n_f32(0); + + pIn = in; + + blkCnt = S->vectorDimension >> 2; + while (blkCnt > 0U) + { + + vec1 = vld1q_f32(pIn); + vec2a = vld1q_f32(pSupporta); + vec2b = vld1q_f32(pSupportb); + vec2c = vld1q_f32(pSupportc); + vec2d = vld1q_f32(pSupportd); + + pIn += 4; + pSupporta += 4; + pSupportb += 4; + pSupportc += 4; + pSupportd += 4; + + accuma = vmlaq_f32(accuma, vec1,vec2a); + accumb = vmlaq_f32(accumb, vec1,vec2b); + accumc = vmlaq_f32(accumc, vec1,vec2c); + accumd = vmlaq_f32(accumd, vec1,vec2d); + + blkCnt -- ; + } + accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); + + accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); + + accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); + + accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); + + + blkCnt = S->vectorDimension & 3; + while (blkCnt > 0U) + { + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3); + + pIn++; + + blkCnt -- ; + } + + vec1 = vld1q_f32(pDualCoefs); + pDualCoefs += 4; + + // To vectorize later + dotV = vmulq_n_f32(dotV, S->gamma); + dotV = vaddq_f32(dotV, coef0); + + dotV = arm_vec_exponent_f32(dotV,S->degree); + + accum = vmulq_f32(vec1,dotV); + accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); + sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); + + pSupporta += 3*S->vectorDimension; + pSupportb += 3*S->vectorDimension; + pSupportc += 3*S->vectorDimension; + pSupportd += 3*S->vectorDimension; + + vectorBlkCnt -- ; + } + + pSupport = pSupporta; + vectorBlkCnt = S->nbOfSupportVectors & 3; + + while (vectorBlkCnt > 0U) + { + accum = vdupq_n_f32(0); + dot = 0.0f; + pIn = in; + + blkCnt = S->vectorDimension >> 2; + while (blkCnt > 0U) + { + + vec1 = vld1q_f32(pIn); + vec2 = vld1q_f32(pSupport); + pIn += 4; + pSupport += 4; + + accum = vmlaq_f32(accum, vec1,vec2); + + blkCnt -- ; + } + accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); + dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); + + + blkCnt = S->vectorDimension & 3; + while (blkCnt > 0U) + { + dot = dot + *pIn++ * *pSupport++; + + blkCnt -- ; + } + + sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree); + vectorBlkCnt -- ; + } + + *pResult=S->classes[STEP(sum)]; +} +#else +void arm_svm_polynomial_predict_f32( + const arm_svm_polynomial_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + float32_t sum=S->intercept; + float32_t dot=0; + uint32_t i,j; + const float32_t *pSupport = S->supportVectors; + + for(i=0; i < S->nbOfSupportVectors; i++) + { + dot=0; + for(j=0; j < S->vectorDimension; j++) + { + dot = dot + in[j]* *pSupport++; + } + sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree); + } + + *pResult=S->classes[STEP(sum)]; +} +#endif +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + + +/** + * @} end of polysvm group + */ diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c new file mode 100644 index 0000000..43de249 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c @@ -0,0 +1,97 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_rbf_init_f16.c + * Description: SVM Radial Basis Function Instance Initialization + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include +#include + +/** + @ingroup groupSVM + */ + +/** + @defgroup rbfsvm RBF SVM + + RBF SVM classifier + */ + + +/** + * @addtogroup rbfsvm + * @{ + */ + + +/** + * @brief SVM radial basis function instance init function + * + * Classes are integer used as output of the function (instead of having -1,1 + * as class values). + * + * @param[in] S points to an instance of the polynomial SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + +void arm_svm_rbf_init_f16(arm_svm_rbf_instance_f16 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float16_t intercept, + const float16_t *dualCoefficients, + const float16_t *supportVectors, + const int32_t *classes, + float16_t gamma + ) +{ + S->nbOfSupportVectors = nbOfSupportVectors; + S->vectorDimension = vectorDimension; + S->intercept = intercept; + S->dualCoefficients = dualCoefficients; + S->supportVectors = supportVectors; + S->classes = classes; + S->gamma = gamma; +} + + + +/** + * @} end of rbfsvm group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c new file mode 100644 index 0000000..77bf282 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c @@ -0,0 +1,91 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_rbf_init_f32.c + * Description: SVM Radial Basis Function Instance Initialization + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions.h" +#include +#include + +/** + @ingroup groupSVM + */ + +/** + @defgroup rbfsvm RBF SVM + + RBF SVM classifier + */ + + +/** + * @addtogroup rbfsvm + * @{ + */ + + +/** + * @brief SVM radial basis function instance init function + * + * Classes are integer used as output of the function (instead of having -1,1 + * as class values). + * + * @param[in] S points to an instance of the polynomial SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + +void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float32_t intercept, + const float32_t *dualCoefficients, + const float32_t *supportVectors, + const int32_t *classes, + float32_t gamma + ) +{ + S->nbOfSupportVectors = nbOfSupportVectors; + S->vectorDimension = vectorDimension; + S->intercept = intercept; + S->dualCoefficients = dualCoefficients; + S->supportVectors = supportVectors; + S->classes = classes; + S->gamma = gamma; +} + + + +/** + * @} end of rbfsvm group + */ diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c new file mode 100644 index 0000000..91afcc1 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c @@ -0,0 +1,352 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_rbf_predict_f16.c + * Description: SVM Radial Basis Function Classifier + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include +#include + + +/** + * @addtogroup rbfsvm + * @{ + */ + + +/** + * @brief SVM rbf prediction + * @param[in] S Pointer to an instance of the rbf SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult decision value + * @return none. + * + */ + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_math_f16.h" + +void arm_svm_rbf_predict_f16( + const arm_svm_rbf_instance_f16 *S, + const float16_t * in, + int32_t * pResult) +{ + /* inlined Matrix x Vector function interleaved with dot prod */ + uint32_t numRows = S->nbOfSupportVectors; + uint32_t numCols = S->vectorDimension; + const float16_t *pSupport = S->supportVectors; + const float16_t *pSrcA = pSupport; + const float16_t *pInA0; + const float16_t *pInA1; + uint32_t row; + uint32_t blkCnt; /* loop counters */ + const float16_t *pDualCoef = S->dualCoefficients; + _Float16 sum = S->intercept; + f16x8_t vSum = vdupq_n_f16(0.0f16); + + row = numRows; + + /* + * compute 4 rows in parrallel + */ + while (row >= 4) { + const float16_t *pInA2, *pInA3; + float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; + f16x8_t vecIn, acc0, acc1, acc2, acc3; + float16_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 4 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + pInA2 = pInA1 + numCols; + pInA3 = pInA2 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f16); + acc1 = vdupq_n_f16(0.0f16); + acc2 = vdupq_n_f16(0.0f16); + acc3 = vdupq_n_f16(0.0f16); + + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + pSrcA2Vec = pInA2; + pSrcA3Vec = pInA3; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + f16x8_t vecDif; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 8; + vecDif = vsubq(vecIn, vecA); + acc1 = vfmaq(acc1, vecDif, vecDif); + vecA = vld1q(pSrcA2Vec); + pSrcA2Vec += 8; + vecDif = vsubq(vecIn, vecA); + acc2 = vfmaq(acc2, vecDif, vecDif); + vecA = vld1q(pSrcA3Vec); + pSrcA3Vec += 8; + vecDif = vsubq(vecIn, vecA); + acc3 = vfmaq(acc3, vecDif, vecDif); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + f16x8_t vecDif; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + vecA = vldrhq_z_f16(pSrcA1Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc1 = vfmaq(acc1, vecDif, vecDif); + vecA = vldrhq_z_f16(pSrcA2Vec, p0);; + vecDif = vsubq(vecIn, vecA); + acc2 = vfmaq(acc2, vecDif, vecDif); + vecA = vldrhq_z_f16(pSrcA3Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc3 = vfmaq(acc3, vecDif, vecDif); + } + /* + * Sum the partial parts + */ + + //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF16Mve(acc0)); + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3); + + vSum = + vfmaq_m_f16(vSum, vld1q(pDualCoef), + vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)),vctp16q(4)); + pDualCoef += 4; + pSrcA += numCols * 4; + /* + * Decrement the row loop counter + */ + row -= 4; + } + + /* + * compute 2 rows in parrallel + */ + if (row >= 2) { + float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; + f16x8_t vecIn, acc0, acc1; + float16_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 2 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f16); + acc1 = vdupq_n_f16(0.0f16); + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + f16x8_t vecDif; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif);; + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 8; + vecDif = vsubq(vecIn, vecA); + acc1 = vfmaq(acc1, vecDif, vecDif); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA, vecDif; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + vecA = vldrhq_z_f16(pSrcA1Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc1 = vfmaq(acc1, vecDif, vecDif); + } + /* + * Sum the partial parts + */ + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1); + + vSum = + vfmaq_m_f16(vSum, vld1q(pDualCoef), + vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(2)); + pDualCoef += 2; + + pSrcA += numCols * 2; + row -= 2; + } + + if (row >= 1) { + f16x8_t vecIn, acc0; + float16_t const *pSrcA0Vec, *pInVec; + float16_t const *pSrcVecPtr = in; + /* + * Initialize the pointers to last MatrixA row + */ + pInA0 = pSrcA; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + + pSrcA0Vec = pInA0; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA, vecDif; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA, vecDif; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + } + /* + * Sum the partial parts + */ + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + + vSum = + vfmaq_m_f16(vSum, vld1q(pDualCoef), + vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(1)); + + } + + + sum += (_Float16)vecAddAcrossF16Mve(vSum); + *pResult = S->classes[STEP(sum)]; +} + +#else +void arm_svm_rbf_predict_f16( + const arm_svm_rbf_instance_f16 *S, + const float16_t * in, + int32_t * pResult) +{ + _Float16 sum=S->intercept; + _Float16 dot=00.f16; + uint32_t i,j; + const float16_t *pSupport = S->supportVectors; + + for(i=0; i < S->nbOfSupportVectors; i++) + { + dot=0.0f16; + for(j=0; j < S->vectorDimension; j++) + { + dot = dot + SQ((_Float16)in[j] - (_Float16) *pSupport); + pSupport++; + } + sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf((float32_t)(-(_Float16)S->gamma * (_Float16)dot)); + } + *pResult=S->classes[STEP(sum)]; +} + +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of rbfsvm group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c new file mode 100644 index 0000000..d3c43bf --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c @@ -0,0 +1,523 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_rbf_predict_f32.c + * Description: SVM Radial Basis Function Classifier + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions.h" +#include +#include + + +/** + * @addtogroup rbfsvm + * @{ + */ + + +/** + * @brief SVM rbf prediction + * @param[in] S Pointer to an instance of the rbf SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult decision value + * @return none. + * + */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_math.h" + +void arm_svm_rbf_predict_f32( + const arm_svm_rbf_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + /* inlined Matrix x Vector function interleaved with dot prod */ + uint32_t numRows = S->nbOfSupportVectors; + uint32_t numCols = S->vectorDimension; + const float32_t *pSupport = S->supportVectors; + const float32_t *pSrcA = pSupport; + const float32_t *pInA0; + const float32_t *pInA1; + uint32_t row; + uint32_t blkCnt; /* loop counters */ + const float32_t *pDualCoef = S->dualCoefficients; + float32_t sum = S->intercept; + f32x4_t vSum = vdupq_n_f32(0); + + row = numRows; + + /* + * compute 4 rows in parrallel + */ + while (row >= 4) { + const float32_t *pInA2, *pInA3; + float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; + f32x4_t vecIn, acc0, acc1, acc2, acc3; + float32_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 4 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + pInA2 = pInA1 + numCols; + pInA3 = pInA2 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + acc1 = vdupq_n_f32(0.0f); + acc2 = vdupq_n_f32(0.0f); + acc3 = vdupq_n_f32(0.0f); + + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + pSrcA2Vec = pInA2; + pSrcA3Vec = pInA3; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + f32x4_t vecDif; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 4; + vecDif = vsubq(vecIn, vecA); + acc1 = vfmaq(acc1, vecDif, vecDif); + vecA = vld1q(pSrcA2Vec); + pSrcA2Vec += 4; + vecDif = vsubq(vecIn, vecA); + acc2 = vfmaq(acc2, vecDif, vecDif); + vecA = vld1q(pSrcA3Vec); + pSrcA3Vec += 4; + vecDif = vsubq(vecIn, vecA); + acc3 = vfmaq(acc3, vecDif, vecDif); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + f32x4_t vecDif; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + vecA = vldrwq_z_f32(pSrcA1Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc1 = vfmaq(acc1, vecDif, vecDif); + vecA = vldrwq_z_f32(pSrcA2Vec, p0);; + vecDif = vsubq(vecIn, vecA); + acc2 = vfmaq(acc2, vecDif, vecDif); + vecA = vldrwq_z_f32(pSrcA3Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc3 = vfmaq(acc3, vecDif, vecDif); + } + /* + * Sum the partial parts + */ + + //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF32Mve(acc0)); + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3); + + vSum = + vfmaq_f32(vSum, vld1q(pDualCoef), + vexpq_f32(vmulq_n_f32(vtmp, -S->gamma))); + pDualCoef += 4; + pSrcA += numCols * 4; + /* + * Decrement the row loop counter + */ + row -= 4; + } + + /* + * compute 2 rows in parrallel + */ + if (row >= 2) { + float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; + f32x4_t vecIn, acc0, acc1; + float32_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 2 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + acc1 = vdupq_n_f32(0.0f); + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + f32x4_t vecDif; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif);; + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 4; + vecDif = vsubq(vecIn, vecA); + acc1 = vfmaq(acc1, vecDif, vecDif); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA, vecDif; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + vecA = vldrwq_z_f32(pSrcA1Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc1 = vfmaq(acc1, vecDif, vecDif); + } + /* + * Sum the partial parts + */ + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1); + + vSum = + vfmaq_m_f32(vSum, vld1q(pDualCoef), + vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(2)); + pDualCoef += 2; + + pSrcA += numCols * 2; + row -= 2; + } + + if (row >= 1) { + f32x4_t vecIn, acc0; + float32_t const *pSrcA0Vec, *pInVec; + float32_t const *pSrcVecPtr = in; + /* + * Initialize the pointers to last MatrixA row + */ + pInA0 = pSrcA; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + + pSrcA0Vec = pInA0; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA, vecDif; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA, vecDif; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + vecDif = vsubq(vecIn, vecA); + acc0 = vfmaq(acc0, vecDif, vecDif); + } + /* + * Sum the partial parts + */ + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + + vSum = + vfmaq_m_f32(vSum, vld1q(pDualCoef), + vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(1)); + + } + + + sum += vecAddAcrossF32Mve(vSum); + *pResult = S->classes[STEP(sum)]; +} + + +#else +#if defined(ARM_MATH_NEON) + +#include "NEMath.h" + +void arm_svm_rbf_predict_f32( + const arm_svm_rbf_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + float32_t sum = S->intercept; + + float32_t dot; + float32x4_t dotV; + + float32x4_t accuma,accumb,accumc,accumd,accum; + float32x2_t accum2; + float32x4_t temp; + float32x4_t vec1; + + float32x4_t vec2,vec2a,vec2b,vec2c,vec2d; + + uint32_t blkCnt; + uint32_t vectorBlkCnt; + + const float32_t *pIn = in; + + const float32_t *pSupport = S->supportVectors; + + const float32_t *pSupporta = S->supportVectors; + const float32_t *pSupportb; + const float32_t *pSupportc; + const float32_t *pSupportd; + + pSupportb = pSupporta + S->vectorDimension; + pSupportc = pSupportb + S->vectorDimension; + pSupportd = pSupportc + S->vectorDimension; + + const float32_t *pDualCoefs = S->dualCoefficients; + + + vectorBlkCnt = S->nbOfSupportVectors >> 2; + while (vectorBlkCnt > 0U) + { + accuma = vdupq_n_f32(0); + accumb = vdupq_n_f32(0); + accumc = vdupq_n_f32(0); + accumd = vdupq_n_f32(0); + + pIn = in; + + blkCnt = S->vectorDimension >> 2; + while (blkCnt > 0U) + { + + vec1 = vld1q_f32(pIn); + vec2a = vld1q_f32(pSupporta); + vec2b = vld1q_f32(pSupportb); + vec2c = vld1q_f32(pSupportc); + vec2d = vld1q_f32(pSupportd); + + pIn += 4; + pSupporta += 4; + pSupportb += 4; + pSupportc += 4; + pSupportd += 4; + + temp = vsubq_f32(vec1, vec2a); + accuma = vmlaq_f32(accuma, temp, temp); + + temp = vsubq_f32(vec1, vec2b); + accumb = vmlaq_f32(accumb, temp, temp); + + temp = vsubq_f32(vec1, vec2c); + accumc = vmlaq_f32(accumc, temp, temp); + + temp = vsubq_f32(vec1, vec2d); + accumd = vmlaq_f32(accumd, temp, temp); + + blkCnt -- ; + } + accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); + + accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); + + accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); + + accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); + + + blkCnt = S->vectorDimension & 3; + while (blkCnt > 0U) + { + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + SQ(*pIn - *pSupporta), dotV,0); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + SQ(*pIn - *pSupportb), dotV,1); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + SQ(*pIn - *pSupportc), dotV,2); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + SQ(*pIn - *pSupportd), dotV,3); + + pSupporta++; + pSupportb++; + pSupportc++; + pSupportd++; + + pIn++; + + blkCnt -- ; + } + + vec1 = vld1q_f32(pDualCoefs); + pDualCoefs += 4; + + // To vectorize later + dotV = vmulq_n_f32(dotV, -S->gamma); + dotV = vexpq_f32(dotV); + + accum = vmulq_f32(vec1,dotV); + accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); + sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); + + pSupporta += 3*S->vectorDimension; + pSupportb += 3*S->vectorDimension; + pSupportc += 3*S->vectorDimension; + pSupportd += 3*S->vectorDimension; + + vectorBlkCnt -- ; + } + + pSupport = pSupporta; + vectorBlkCnt = S->nbOfSupportVectors & 3; + + while (vectorBlkCnt > 0U) + { + accum = vdupq_n_f32(0); + dot = 0.0f; + pIn = in; + + blkCnt = S->vectorDimension >> 2; + while (blkCnt > 0U) + { + + vec1 = vld1q_f32(pIn); + vec2 = vld1q_f32(pSupport); + pIn += 4; + pSupport += 4; + + temp = vsubq_f32(vec1,vec2); + accum = vmlaq_f32(accum, temp,temp); + + blkCnt -- ; + } + accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); + dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); + + + blkCnt = S->vectorDimension & 3; + while (blkCnt > 0U) + { + + dot = dot + SQ(*pIn - *pSupport); + pIn++; + pSupport++; + + blkCnt -- ; + } + + sum += *pDualCoefs++ * expf(-S->gamma * dot); + vectorBlkCnt -- ; + } + + *pResult=S->classes[STEP(sum)]; +} +#else +void arm_svm_rbf_predict_f32( + const arm_svm_rbf_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + float32_t sum=S->intercept; + float32_t dot=0; + uint32_t i,j; + const float32_t *pSupport = S->supportVectors; + + for(i=0; i < S->nbOfSupportVectors; i++) + { + dot=0; + for(j=0; j < S->vectorDimension; j++) + { + dot = dot + SQ(in[j] - *pSupport); + pSupport++; + } + sum += S->dualCoefficients[i] * expf(-S->gamma * dot); + } + *pResult=S->classes[STEP(sum)]; +} +#endif + +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of rbfsvm group + */ diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c new file mode 100644 index 0000000..7a27417 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c @@ -0,0 +1,98 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_sigmoid_predict_f16.c + * Description: SVM Sigmoid Instance Initialization + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include +#include + +/** + @ingroup groupSVM + */ + +/** + @defgroup sigmoidsvm Sigmoid SVM + + Sigmoid SVM classifier + */ + +/** + * @addtogroup sigmoidsvm + * @{ + */ + + +/** + * @brief SVM sigmoid instance init function + * + * Classes are integer used as output of the function (instead of having -1,1 + * as class values). + * + * @param[in] S points to an instance of the rbf SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] coef0 coeff0 (scikit-learn terminology) + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + +void arm_svm_sigmoid_init_f16(arm_svm_sigmoid_instance_f16 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float16_t intercept, + const float16_t *dualCoefficients, + const float16_t *supportVectors, + const int32_t *classes, + float16_t coef0, + float16_t gamma + ) +{ + S->nbOfSupportVectors = nbOfSupportVectors; + S->vectorDimension = vectorDimension; + S->intercept = intercept; + S->dualCoefficients = dualCoefficients; + S->supportVectors = supportVectors; + S->classes = classes; + S->coef0 = coef0; + S->gamma = gamma; +} + + +/** + * @} end of sigmoidsvm group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c new file mode 100644 index 0000000..a7f16c2 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c @@ -0,0 +1,92 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_sigmoid_predict_f32.c + * Description: SVM Sigmoid Instance Initialization + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions.h" +#include +#include + +/** + @ingroup groupSVM + */ + +/** + @defgroup sigmoidsvm Sigmoid SVM + + Sigmoid SVM classifier + */ + +/** + * @addtogroup sigmoidsvm + * @{ + */ + + +/** + * @brief SVM sigmoid instance init function + * + * Classes are integer used as output of the function (instead of having -1,1 + * as class values). + * + * @param[in] S points to an instance of the rbf SVM structure. + * @param[in] nbOfSupportVectors Number of support vectors + * @param[in] vectorDimension Dimension of vector space + * @param[in] intercept Intercept + * @param[in] dualCoefficients Array of dual coefficients + * @param[in] supportVectors Array of support vectors + * @param[in] classes Array of 2 classes ID + * @param[in] coef0 coeff0 (scikit-learn terminology) + * @param[in] gamma gamma (scikit-learn terminology) + * @return none. + * + */ + +void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S, + uint32_t nbOfSupportVectors, + uint32_t vectorDimension, + float32_t intercept, + const float32_t *dualCoefficients, + const float32_t *supportVectors, + const int32_t *classes, + float32_t coef0, + float32_t gamma + ) +{ + S->nbOfSupportVectors = nbOfSupportVectors; + S->vectorDimension = vectorDimension; + S->intercept = intercept; + S->dualCoefficients = dualCoefficients; + S->supportVectors = supportVectors; + S->classes = classes; + S->coef0 = coef0; + S->gamma = gamma; +} + + +/** + * @} end of sigmoidsvm group + */ diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c new file mode 100644 index 0000000..e2d541f --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c @@ -0,0 +1,333 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_sigmoid_predict_f16.c + * Description: SVM Sigmoid Classifier + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include +#include + +/** + * @addtogroup sigmoidsvm + * @{ + */ + + + +/** + * @brief SVM sigmoid prediction + * @param[in] S Pointer to an instance of the rbf SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_math_f16.h" + +void arm_svm_sigmoid_predict_f16( + const arm_svm_sigmoid_instance_f16 *S, + const float16_t * in, + int32_t * pResult) +{ + /* inlined Matrix x Vector function interleaved with dot prod */ + uint32_t numRows = S->nbOfSupportVectors; + uint32_t numCols = S->vectorDimension; + const float16_t *pSupport = S->supportVectors; + const float16_t *pSrcA = pSupport; + const float16_t *pInA0; + const float16_t *pInA1; + uint32_t row; + uint32_t blkCnt; /* loop counters */ + const float16_t *pDualCoef = S->dualCoefficients; + _Float16 sum = S->intercept; + f16x8_t vSum = vdupq_n_f16(0.0f); + + row = numRows; + + /* + * compute 4 rows in parrallel + */ + while (row >= 4) { + const float16_t *pInA2, *pInA3; + float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; + f16x8_t vecIn, acc0, acc1, acc2, acc3; + float16_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 4 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + pInA2 = pInA1 + numCols; + pInA3 = pInA2 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + acc1 = vdupq_n_f16(0.0f); + acc2 = vdupq_n_f16(0.0f); + acc3 = vdupq_n_f16(0.0f); + + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + pSrcA2Vec = pInA2; + pSrcA3Vec = pInA3; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 8; + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vld1q(pSrcA2Vec); + pSrcA2Vec += 8; + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vld1q(pSrcA3Vec); + pSrcA3Vec += 8; + acc3 = vfmaq(acc3, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA2Vec, p0); + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA3Vec, p0); + acc3 = vfmaq(acc3, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3); + + vSum = + vfmaq_m_f16(vSum, vld1q(pDualCoef), + vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),vctp16q(4)); + + pDualCoef += 4; + + pSrcA += numCols * 4; + /* + * Decrement the row loop counter + */ + row -= 4; + } + + /* + * compute 2 rows in parrallel + */ + if (row >= 2) { + float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; + f16x8_t vecIn, acc0, acc1; + float16_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 2 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + acc1 = vdupq_n_f16(0.0f); + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 8; + acc1 = vfmaq(acc1, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrhq_z_f16(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1); + + vSum = + vfmaq_m_f16(vSum, vld1q(pDualCoef), + vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)), + vctp16q(2)); + + pSrcA += numCols * 2; + row -= 2; + } + + if (row >= 1) { + f16x8_t vecIn, acc0; + float16_t const *pSrcA0Vec, *pInVec; + float16_t const *pSrcVecPtr = in; + /* + * Initialize the pointers to last MatrixA row + */ + pInA0 = pSrcA; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f16(0.0f); + + pSrcA0Vec = pInA0; + + blkCnt = numCols >> 3; + while (blkCnt > 0U) { + f16x8_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 8; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 8; + acc0 = vfmaq(acc0, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 7; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); + f16x8_t vecA; + + vecIn = vldrhq_z_f16(pInVec, p0); + vecA = vldrhq_z_f16(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f16x8_t vtmp = vuninitializedq_f16(); + vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0); + + vSum = + vfmaq_m_f16(vSum, vld1q(pDualCoef), + vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)), + vctp16q(1)); + } + sum += (_Float16)vecAddAcrossF16Mve(vSum); + + *pResult = S->classes[STEP(sum)]; +} + +#else +void arm_svm_sigmoid_predict_f16( + const arm_svm_sigmoid_instance_f16 *S, + const float16_t * in, + int32_t * pResult) +{ + _Float16 sum=S->intercept; + _Float16 dot=0.0f16; + uint32_t i,j; + const float16_t *pSupport = S->supportVectors; + + for(i=0; i < S->nbOfSupportVectors; i++) + { + dot=0.0f16; + for(j=0; j < S->vectorDimension; j++) + { + dot = (_Float16)dot + (_Float16)in[j] * (_Float16)*pSupport++; + } + sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((float32_t)((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0)); + } + *pResult=S->classes[STEP(sum)]; +} + +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of sigmoidsvm group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c new file mode 100644 index 0000000..37c8a08 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c @@ -0,0 +1,487 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_svm_sigmoid_predict_f32.c + * Description: SVM Sigmoid Classifier + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/svm_functions.h" +#include +#include + +/** + * @addtogroup sigmoidsvm + * @{ + */ + + + +/** + * @brief SVM sigmoid prediction + * @param[in] S Pointer to an instance of the rbf SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_math.h" + +void arm_svm_sigmoid_predict_f32( + const arm_svm_sigmoid_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + /* inlined Matrix x Vector function interleaved with dot prod */ + uint32_t numRows = S->nbOfSupportVectors; + uint32_t numCols = S->vectorDimension; + const float32_t *pSupport = S->supportVectors; + const float32_t *pSrcA = pSupport; + const float32_t *pInA0; + const float32_t *pInA1; + uint32_t row; + uint32_t blkCnt; /* loop counters */ + const float32_t *pDualCoef = S->dualCoefficients; + float32_t sum = S->intercept; + f32x4_t vSum = vdupq_n_f32(0.0f); + + row = numRows; + + /* + * compute 4 rows in parrallel + */ + while (row >= 4) { + const float32_t *pInA2, *pInA3; + float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec; + f32x4_t vecIn, acc0, acc1, acc2, acc3; + float32_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 4 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + pInA2 = pInA1 + numCols; + pInA3 = pInA2 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + acc1 = vdupq_n_f32(0.0f); + acc2 = vdupq_n_f32(0.0f); + acc3 = vdupq_n_f32(0.0f); + + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + pSrcA2Vec = pInA2; + pSrcA3Vec = pInA3; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 4; + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vld1q(pSrcA2Vec); + pSrcA2Vec += 4; + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vld1q(pSrcA3Vec); + pSrcA3Vec += 4; + acc3 = vfmaq(acc3, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA2Vec, p0); + acc2 = vfmaq(acc2, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA3Vec, p0); + acc3 = vfmaq(acc3, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3); + + vSum = + vfmaq_f32(vSum, vld1q(pDualCoef), + vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0))); + + pDualCoef += 4; + + pSrcA += numCols * 4; + /* + * Decrement the row loop counter + */ + row -= 4; + } + + /* + * compute 2 rows in parrallel + */ + if (row >= 2) { + float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec; + f32x4_t vecIn, acc0, acc1; + float32_t const *pSrcVecPtr = in; + + /* + * Initialize the pointers to 2 consecutive MatrixA rows + */ + pInA0 = pSrcA; + pInA1 = pInA0 + numCols; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + acc1 = vdupq_n_f32(0.0f); + pSrcA0Vec = pInA0; + pSrcA1Vec = pInA1; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vld1q(pSrcA1Vec); + pSrcA1Vec += 4; + acc1 = vfmaq(acc1, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + vecA = vldrwq_z_f32(pSrcA1Vec, p0); + acc1 = vfmaq(acc1, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1); + + vSum = + vfmaq_m_f32(vSum, vld1q(pDualCoef), + vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)), + vctp32q(2)); + + pSrcA += numCols * 2; + row -= 2; + } + + if (row >= 1) { + f32x4_t vecIn, acc0; + float32_t const *pSrcA0Vec, *pInVec; + float32_t const *pSrcVecPtr = in; + /* + * Initialize the pointers to last MatrixA row + */ + pInA0 = pSrcA; + /* + * Initialize the vector pointer + */ + pInVec = pSrcVecPtr; + /* + * reset accumulators + */ + acc0 = vdupq_n_f32(0.0f); + + pSrcA0Vec = pInA0; + + blkCnt = numCols >> 2; + while (blkCnt > 0U) { + f32x4_t vecA; + + vecIn = vld1q(pInVec); + pInVec += 4; + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vfmaq(acc0, vecIn, vecA); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = numCols & 3; + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + f32x4_t vecA; + + vecIn = vldrwq_z_f32(pInVec, p0); + vecA = vldrwq_z_f32(pSrcA0Vec, p0); + acc0 = vfmaq(acc0, vecIn, vecA); + } + /* + * Sum the partial parts + */ + f32x4_t vtmp = vuninitializedq_f32(); + vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0); + + vSum = + vfmaq_m_f32(vSum, vld1q(pDualCoef), + vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)), + vctp32q(1)); + } + sum += vecAddAcrossF32Mve(vSum); + + *pResult = S->classes[STEP(sum)]; +} + +#else +#if defined(ARM_MATH_NEON) +#include "NEMath.h" + +void arm_svm_sigmoid_predict_f32( + const arm_svm_sigmoid_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + float32_t sum = S->intercept; + + float32_t dot; + float32x4_t dotV; + + float32x4_t accuma,accumb,accumc,accumd,accum; + float32x2_t accum2; + float32x4_t vec1; + float32x4_t coef0 = vdupq_n_f32(S->coef0); + + float32x4_t vec2,vec2a,vec2b,vec2c,vec2d; + + uint32_t blkCnt; + uint32_t vectorBlkCnt; + + const float32_t *pIn = in; + + const float32_t *pSupport = S->supportVectors; + + const float32_t *pSupporta = S->supportVectors; + const float32_t *pSupportb; + const float32_t *pSupportc; + const float32_t *pSupportd; + + pSupportb = pSupporta + S->vectorDimension; + pSupportc = pSupportb + S->vectorDimension; + pSupportd = pSupportc + S->vectorDimension; + + const float32_t *pDualCoefs = S->dualCoefficients; + + vectorBlkCnt = S->nbOfSupportVectors >> 2; + while (vectorBlkCnt > 0U) + { + accuma = vdupq_n_f32(0); + accumb = vdupq_n_f32(0); + accumc = vdupq_n_f32(0); + accumd = vdupq_n_f32(0); + + pIn = in; + + blkCnt = S->vectorDimension >> 2; + while (blkCnt > 0U) + { + + vec1 = vld1q_f32(pIn); + vec2a = vld1q_f32(pSupporta); + vec2b = vld1q_f32(pSupportb); + vec2c = vld1q_f32(pSupportc); + vec2d = vld1q_f32(pSupportd); + + pIn += 4; + pSupporta += 4; + pSupportb += 4; + pSupportc += 4; + pSupportd += 4; + + accuma = vmlaq_f32(accuma, vec1,vec2a); + accumb = vmlaq_f32(accumb, vec1,vec2b); + accumc = vmlaq_f32(accumc, vec1,vec2c); + accumd = vmlaq_f32(accumd, vec1,vec2d); + + blkCnt -- ; + } + accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); + + accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); + + accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); + + accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); + + + blkCnt = S->vectorDimension & 3; + while (blkCnt > 0U) + { + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3); + + pIn++; + + blkCnt -- ; + } + + vec1 = vld1q_f32(pDualCoefs); + pDualCoefs += 4; + + // To vectorize later + dotV = vmulq_n_f32(dotV, S->gamma); + dotV = vaddq_f32(dotV, coef0); + + dotV = vtanhq_f32(dotV); + + accum = vmulq_f32(vec1,dotV); + accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); + sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); + + pSupporta += 3*S->vectorDimension; + pSupportb += 3*S->vectorDimension; + pSupportc += 3*S->vectorDimension; + pSupportd += 3*S->vectorDimension; + + vectorBlkCnt -- ; + } + + pSupport = pSupporta; + vectorBlkCnt = S->nbOfSupportVectors & 3; + + while (vectorBlkCnt > 0U) + { + accum = vdupq_n_f32(0); + dot = 0.0f; + pIn = in; + + blkCnt = S->vectorDimension >> 2; + while (blkCnt > 0U) + { + + vec1 = vld1q_f32(pIn); + vec2 = vld1q_f32(pSupport); + pIn += 4; + pSupport += 4; + + accum = vmlaq_f32(accum, vec1,vec2); + + blkCnt -- ; + } + accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); + dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); + + + blkCnt = S->vectorDimension & 3; + while (blkCnt > 0U) + { + dot = dot + *pIn++ * *pSupport++; + + blkCnt -- ; + } + + sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0); + vectorBlkCnt -- ; + } + + *pResult=S->classes[STEP(sum)]; +} +#else +void arm_svm_sigmoid_predict_f32( + const arm_svm_sigmoid_instance_f32 *S, + const float32_t * in, + int32_t * pResult) +{ + float32_t sum=S->intercept; + float32_t dot=0; + uint32_t i,j; + const float32_t *pSupport = S->supportVectors; + + for(i=0; i < S->nbOfSupportVectors; i++) + { + dot=0; + for(j=0; j < S->vectorDimension; j++) + { + dot = dot + in[j]* *pSupport++; + } + sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0); + } + *pResult=S->classes[STEP(sum)]; +} + +#endif +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of sigmoidsvm group + */ -- cgit v1.2.3