diff options
author | Clyne Sullivan <clyne@bitgloo.com> | 2025-01-29 21:34:25 -0500 |
---|---|---|
committer | Clyne Sullivan <clyne@bitgloo.com> | 2025-01-29 21:34:25 -0500 |
commit | 5b81bc8ccbd342b8566d88fc9f17a73aec03b5b6 (patch) | |
tree | cc57486912cfa74c6440d8b97c28f451ec787d78 /Drivers/CMSIS/DSP/Source/SupportFunctions |
initial commit
Diffstat (limited to 'Drivers/CMSIS/DSP/Source/SupportFunctions')
45 files changed, 7920 insertions, 0 deletions
diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt b/Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt new file mode 100644 index 0000000..41b13f2 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required (VERSION 3.14) + +project(CMSISDSPSupport) + +include(configLib) +include(configDsp) + +file(GLOB SRC "./*_*.c") + +add_library(CMSISDSPSupport STATIC ${SRC}) + +configLib(CMSISDSPSupport ${ROOT}) +configDsp(CMSISDSPSupport ${ROOT}) + +### Includes +target_include_directories(CMSISDSPSupport PUBLIC "${DSP}/Include") + +if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16)) +target_sources(CMSISDSPSupport PRIVATE arm_copy_f16.c) +target_sources(CMSISDSPSupport PRIVATE arm_fill_f16.c) +target_sources(CMSISDSPSupport PRIVATE arm_f16_to_q15.c) +target_sources(CMSISDSPSupport PRIVATE arm_q15_to_f16.c) +target_sources(CMSISDSPSupport PRIVATE arm_float_to_f16.c) +target_sources(CMSISDSPSupport PRIVATE arm_f16_to_float.c) +target_sources(CMSISDSPSupport PRIVATE arm_weighted_sum_f16.c) +target_sources(CMSISDSPSupport PRIVATE arm_barycenter_f16.c) +endif() + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c new file mode 100644 index 0000000..ca8b1b6 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c @@ -0,0 +1,63 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: SupportFunctions.c + * Description: Combination of all support function source files. + * + * $Date: 16. March 2020 + * $Revision: V1.1.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_barycenter_f32.c" +#include "arm_bitonic_sort_f32.c" +#include "arm_bubble_sort_f32.c" +#include "arm_copy_f32.c" +#include "arm_copy_f64.c" +#include "arm_copy_q15.c" +#include "arm_copy_q31.c" +#include "arm_copy_q7.c" +#include "arm_fill_f32.c" +#include "arm_fill_f64.c" +#include "arm_fill_q15.c" +#include "arm_fill_q31.c" +#include "arm_fill_q7.c" +#include "arm_heap_sort_f32.c" +#include "arm_insertion_sort_f32.c" +#include "arm_merge_sort_f32.c" +#include "arm_merge_sort_init_f32.c" +#include "arm_quick_sort_f32.c" +#include "arm_selection_sort_f32.c" +#include "arm_sort_f32.c" +#include "arm_sort_init_f32.c" +#include "arm_weighted_sum_f32.c" + +#include "arm_float_to_q15.c" +#include "arm_float_to_q31.c" +#include "arm_float_to_q7.c" +#include "arm_q15_to_float.c" +#include "arm_q15_to_q31.c" +#include "arm_q15_to_q7.c" +#include "arm_q31_to_float.c" +#include "arm_q31_to_q15.c" +#include "arm_q31_to_q7.c" +#include "arm_q7_to_float.c" +#include "arm_q7_to_q15.c" +#include "arm_q7_to_q31.c" diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c new file mode 100644 index 0000000..0e39d8d --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: SupportFunctions.c + * Description: Combination of all support function source files. + * + * $Date: 16. March 2020 + * $Revision: V1.1.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_copy_f16.c" +#include "arm_fill_f16.c" +#include "arm_f16_to_q15.c" +#include "arm_f16_to_float.c" +#include "arm_q15_to_f16.c" +#include "arm_float_to_f16.c" +#include "arm_weighted_sum_f16.c" +#include "arm_barycenter_f16.c" diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c new file mode 100644 index 0000000..6dfe55c --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c @@ -0,0 +1,274 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_barycenter_f16.c + * Description: Barycenter + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +#include <limits.h> +#include <math.h> + +/** + @ingroup groupSupport + */ + +/** + @defgroup barycenter Barycenter + + Barycenter of weighted vectors + */ + +/** + @addtogroup barycenter + @{ + */ + + +/** + * @brief Barycenter + * + * + * @param[in] *in List of vectors + * @param[in] *weights Weights of the vectors + * @param[out] *out Barycenter + * @param[in] nbVectors Number of vectors + * @param[in] vecDim Dimension of space (vector dimension) + * @return None + * + */ + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +void arm_barycenter_f16(const float16_t *in, + const float16_t *weights, + float16_t *out, + uint32_t nbVectors, + uint32_t vecDim) +{ + const float16_t *pIn, *pW; + const float16_t *pIn1, *pIn2, *pIn3, *pIn4; + float16_t *pOut; + uint32_t blkCntVector, blkCntSample; + float16_t accum, w; + + blkCntVector = nbVectors; + blkCntSample = vecDim; + + accum = 0.0f; + + pW = weights; + pIn = in; + + + arm_fill_f16(0.0f, out, vecDim); + + + /* Sum */ + pIn1 = pIn; + pIn2 = pIn1 + vecDim; + pIn3 = pIn2 + vecDim; + pIn4 = pIn3 + vecDim; + + blkCntVector = nbVectors >> 2; + while (blkCntVector > 0) + { + f16x8_t outV, inV1, inV2, inV3, inV4; + float16_t w1, w2, w3, w4; + + pOut = out; + w1 = *pW++; + w2 = *pW++; + w3 = *pW++; + w4 = *pW++; + accum += (_Float16)w1 + (_Float16)w2 + (_Float16)w3 + (_Float16)w4; + + blkCntSample = vecDim >> 3; + while (blkCntSample > 0) { + outV = vld1q((const float16_t *) pOut); + inV1 = vld1q(pIn1); + inV2 = vld1q(pIn2); + inV3 = vld1q(pIn3); + inV4 = vld1q(pIn4); + outV = vfmaq(outV, inV1, w1); + outV = vfmaq(outV, inV2, w2); + outV = vfmaq(outV, inV3, w3); + outV = vfmaq(outV, inV4, w4); + vst1q(pOut, outV); + + pOut += 8; + pIn1 += 8; + pIn2 += 8; + pIn3 += 8; + pIn4 += 8; + + blkCntSample--; + } + + blkCntSample = vecDim & 7; + while (blkCntSample > 0) { + *pOut = (_Float16)*pOut + (_Float16)*pIn1++ * (_Float16)w1; + *pOut = (_Float16)*pOut + (_Float16)*pIn2++ * (_Float16)w2; + *pOut = (_Float16)*pOut + (_Float16)*pIn3++ * (_Float16)w3; + *pOut = (_Float16)*pOut + (_Float16)*pIn4++ * (_Float16)w4; + pOut++; + blkCntSample--; + } + + pIn1 += 3 * vecDim; + pIn2 += 3 * vecDim; + pIn3 += 3 * vecDim; + pIn4 += 3 * vecDim; + + blkCntVector--; + } + + pIn = pIn1; + + blkCntVector = nbVectors & 3; + while (blkCntVector > 0) + { + f16x8_t inV, outV; + + pOut = out; + w = *pW++; + accum += (_Float16)w; + + blkCntSample = vecDim >> 3; + while (blkCntSample > 0) + { + outV = vld1q_f16(pOut); + inV = vld1q_f16(pIn); + outV = vfmaq(outV, inV, w); + vst1q_f16(pOut, outV); + pOut += 8; + pIn += 8; + + blkCntSample--; + } + + blkCntSample = vecDim & 7; + while (blkCntSample > 0) + { + *pOut = (_Float16)*pOut + (_Float16)*pIn++ * (_Float16)w; + pOut++; + blkCntSample--; + } + + blkCntVector--; + } + + /* Normalize */ + pOut = out; + accum = 1.0f16 / (_Float16)accum; + + blkCntSample = vecDim >> 3; + while (blkCntSample > 0) + { + f16x8_t tmp; + + tmp = vld1q((const float16_t *) pOut); + tmp = vmulq(tmp, accum); + vst1q(pOut, tmp); + pOut += 8; + blkCntSample--; + } + + blkCntSample = vecDim & 7; + while (blkCntSample > 0) + { + *pOut = (_Float16)*pOut * (_Float16)accum; + pOut++; + blkCntSample--; + } +} +#else +void arm_barycenter_f16(const float16_t *in, const float16_t *weights, float16_t *out, uint32_t nbVectors,uint32_t vecDim) +{ + + const float16_t *pIn,*pW; + float16_t *pOut; + uint32_t blkCntVector,blkCntSample; + float16_t accum, w; + + blkCntVector = nbVectors; + blkCntSample = vecDim; + + accum = 0.0f16; + + pW = weights; + pIn = in; + + /* Set counters to 0 */ + blkCntSample = vecDim; + pOut = out; + + while(blkCntSample > 0) + { + *pOut = 0.0f16; + pOut++; + blkCntSample--; + } + + /* Sum */ + while(blkCntVector > 0) + { + pOut = out; + w = *pW++; + accum += (_Float16)w; + + blkCntSample = vecDim; + while(blkCntSample > 0) + { + *pOut = (_Float16)*pOut + (_Float16)*pIn++ * (_Float16)w; + pOut++; + blkCntSample--; + } + + blkCntVector--; + } + + /* Normalize */ + blkCntSample = vecDim; + pOut = out; + + while(blkCntSample > 0) + { + *pOut = (_Float16)*pOut / (_Float16)accum; + pOut++; + blkCntSample--; + } + +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of barycenter group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c new file mode 100644 index 0000000..817bb58 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c @@ -0,0 +1,414 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_barycenter_f32.c + * Description: Barycenter + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" +#include <limits.h> +#include <math.h> + + +/** + @ingroup barycenter + */ + + +/** + * @brief Barycenter + * + * + * @param[in] *in List of vectors + * @param[in] *weights Weights of the vectors + * @param[out] *out Barycenter + * @param[in] nbVectors Number of vectors + * @param[in] vecDim Dimension of space (vector dimension) + * @return None + * + */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_barycenter_f32(const float32_t *in, + const float32_t *weights, + float32_t *out, + uint32_t nbVectors, + uint32_t vecDim) +{ + const float32_t *pIn, *pW; + const float32_t *pIn1, *pIn2, *pIn3, *pIn4; + float32_t *pOut; + uint32_t blkCntVector, blkCntSample; + float32_t accum, w; + + blkCntVector = nbVectors; + blkCntSample = vecDim; + + accum = 0.0f; + + pW = weights; + pIn = in; + + + arm_fill_f32(0.0f, out, vecDim); + + + /* Sum */ + pIn1 = pIn; + pIn2 = pIn1 + vecDim; + pIn3 = pIn2 + vecDim; + pIn4 = pIn3 + vecDim; + + blkCntVector = nbVectors >> 2; + while (blkCntVector > 0) + { + f32x4_t outV, inV1, inV2, inV3, inV4; + float32_t w1, w2, w3, w4; + + pOut = out; + w1 = *pW++; + w2 = *pW++; + w3 = *pW++; + w4 = *pW++; + accum += w1 + w2 + w3 + w4; + + blkCntSample = vecDim >> 2; + while (blkCntSample > 0) { + outV = vld1q((const float32_t *) pOut); + inV1 = vld1q(pIn1); + inV2 = vld1q(pIn2); + inV3 = vld1q(pIn3); + inV4 = vld1q(pIn4); + outV = vfmaq(outV, inV1, w1); + outV = vfmaq(outV, inV2, w2); + outV = vfmaq(outV, inV3, w3); + outV = vfmaq(outV, inV4, w4); + vst1q(pOut, outV); + + pOut += 4; + pIn1 += 4; + pIn2 += 4; + pIn3 += 4; + pIn4 += 4; + + blkCntSample--; + } + + blkCntSample = vecDim & 3; + while (blkCntSample > 0) { + *pOut = *pOut + *pIn1++ * w1; + *pOut = *pOut + *pIn2++ * w2; + *pOut = *pOut + *pIn3++ * w3; + *pOut = *pOut + *pIn4++ * w4; + pOut++; + blkCntSample--; + } + + pIn1 += 3 * vecDim; + pIn2 += 3 * vecDim; + pIn3 += 3 * vecDim; + pIn4 += 3 * vecDim; + + blkCntVector--; + } + + pIn = pIn1; + + blkCntVector = nbVectors & 3; + while (blkCntVector > 0) + { + f32x4_t inV, outV; + + pOut = out; + w = *pW++; + accum += w; + + blkCntSample = vecDim >> 2; + while (blkCntSample > 0) + { + outV = vld1q_f32(pOut); + inV = vld1q_f32(pIn); + outV = vfmaq(outV, inV, w); + vst1q_f32(pOut, outV); + pOut += 4; + pIn += 4; + + blkCntSample--; + } + + blkCntSample = vecDim & 3; + while (blkCntSample > 0) + { + *pOut = *pOut + *pIn++ * w; + pOut++; + blkCntSample--; + } + + blkCntVector--; + } + + /* Normalize */ + pOut = out; + accum = 1.0f / accum; + + blkCntSample = vecDim >> 2; + while (blkCntSample > 0) + { + f32x4_t tmp; + + tmp = vld1q((const float32_t *) pOut); + tmp = vmulq(tmp, accum); + vst1q(pOut, tmp); + pOut += 4; + blkCntSample--; + } + + blkCntSample = vecDim & 3; + while (blkCntSample > 0) + { + *pOut = *pOut * accum; + pOut++; + blkCntSample--; + } +} +#else +#if defined(ARM_MATH_NEON) + +#include "NEMath.h" +void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim) +{ + + const float32_t *pIn,*pW, *pIn1, *pIn2, *pIn3, *pIn4; + float32_t *pOut; + uint32_t blkCntVector,blkCntSample; + float32_t accum, w,w1,w2,w3,w4; + + float32x4_t tmp, inV,outV, inV1, inV2, inV3, inV4; + + blkCntVector = nbVectors; + blkCntSample = vecDim; + + accum = 0.0f; + + pW = weights; + pIn = in; + + /* Set counters to 0 */ + tmp = vdupq_n_f32(0.0f); + pOut = out; + + blkCntSample = vecDim >> 2; + while(blkCntSample > 0) + { + vst1q_f32(pOut, tmp); + pOut += 4; + blkCntSample--; + } + + blkCntSample = vecDim & 3; + while(blkCntSample > 0) + { + *pOut = 0.0f; + pOut++; + blkCntSample--; + } + + /* Sum */ + + pIn1 = pIn; + pIn2 = pIn1 + vecDim; + pIn3 = pIn2 + vecDim; + pIn4 = pIn3 + vecDim; + + blkCntVector = nbVectors >> 2; + while(blkCntVector > 0) + { + pOut = out; + w1 = *pW++; + w2 = *pW++; + w3 = *pW++; + w4 = *pW++; + accum += w1 + w2 + w3 + w4; + + blkCntSample = vecDim >> 2; + while(blkCntSample > 0) + { + outV = vld1q_f32(pOut); + inV1 = vld1q_f32(pIn1); + inV2 = vld1q_f32(pIn2); + inV3 = vld1q_f32(pIn3); + inV4 = vld1q_f32(pIn4); + outV = vmlaq_n_f32(outV,inV1,w1); + outV = vmlaq_n_f32(outV,inV2,w2); + outV = vmlaq_n_f32(outV,inV3,w3); + outV = vmlaq_n_f32(outV,inV4,w4); + vst1q_f32(pOut, outV); + pOut += 4; + pIn1 += 4; + pIn2 += 4; + pIn3 += 4; + pIn4 += 4; + + blkCntSample--; + } + + blkCntSample = vecDim & 3; + while(blkCntSample > 0) + { + *pOut = *pOut + *pIn1++ * w1; + *pOut = *pOut + *pIn2++ * w2; + *pOut = *pOut + *pIn3++ * w3; + *pOut = *pOut + *pIn4++ * w4; + pOut++; + blkCntSample--; + } + + pIn1 += 3*vecDim; + pIn2 += 3*vecDim; + pIn3 += 3*vecDim; + pIn4 += 3*vecDim; + + blkCntVector--; + } + + pIn = pIn1; + + blkCntVector = nbVectors & 3; + while(blkCntVector > 0) + { + pOut = out; + w = *pW++; + accum += w; + + blkCntSample = vecDim >> 2; + while(blkCntSample > 0) + { + outV = vld1q_f32(pOut); + inV = vld1q_f32(pIn); + outV = vmlaq_n_f32(outV,inV,w); + vst1q_f32(pOut, outV); + pOut += 4; + pIn += 4; + + blkCntSample--; + } + + blkCntSample = vecDim & 3; + while(blkCntSample > 0) + { + *pOut = *pOut + *pIn++ * w; + pOut++; + blkCntSample--; + } + + blkCntVector--; + } + + /* Normalize */ + pOut = out; + accum = 1.0f / accum; + + blkCntSample = vecDim >> 2; + while(blkCntSample > 0) + { + tmp = vld1q_f32(pOut); + tmp = vmulq_n_f32(tmp,accum); + vst1q_f32(pOut, tmp); + pOut += 4; + blkCntSample--; + } + + blkCntSample = vecDim & 3; + while(blkCntSample > 0) + { + *pOut = *pOut * accum; + pOut++; + blkCntSample--; + } + +} +#else +void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim) +{ + + const float32_t *pIn,*pW; + float32_t *pOut; + uint32_t blkCntVector,blkCntSample; + float32_t accum, w; + + blkCntVector = nbVectors; + blkCntSample = vecDim; + + accum = 0.0f; + + pW = weights; + pIn = in; + + /* Set counters to 0 */ + blkCntSample = vecDim; + pOut = out; + + while(blkCntSample > 0) + { + *pOut = 0.0f; + pOut++; + blkCntSample--; + } + + /* Sum */ + while(blkCntVector > 0) + { + pOut = out; + w = *pW++; + accum += w; + + blkCntSample = vecDim; + while(blkCntSample > 0) + { + *pOut = *pOut + *pIn++ * w; + pOut++; + blkCntSample--; + } + + blkCntVector--; + } + + /* Normalize */ + blkCntSample = vecDim; + pOut = out; + + while(blkCntSample > 0) + { + *pOut = *pOut / accum; + pOut++; + blkCntSample--; + } + +} +#endif +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of barycenter group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c new file mode 100644 index 0000000..e9612b1 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c @@ -0,0 +1,1039 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_bitonic_sort_f32.c + * Description: Floating point bitonic sort + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" +#include "arm_sorting.h" + + +#if !defined(ARM_MATH_NEON) + +static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir) +{ + uint32_t step; + uint32_t k, j; + float32_t *leftPtr, *rightPtr; + float32_t temp; + + step = n>>1; + leftPtr = pSrc; + rightPtr = pSrc+n-1; + + for(k=0; k<step; k++) + { + if(dir == (*leftPtr > *rightPtr)) + { + // Swap + temp=*leftPtr; + *leftPtr=*rightPtr; + *rightPtr=temp; + } + + leftPtr++; // Move right + rightPtr--; // Move left + } + + // Merge + for(step=(n>>2); step>0; step/=2) + { + for(j=0; j<n; j=j+step*2) + { + leftPtr = pSrc+j; + rightPtr = pSrc+j+step; + + for(k=0; k<step; k++) + { + if(*leftPtr > *rightPtr) + { + // Swap + temp=*leftPtr; + *leftPtr=*rightPtr; + *rightPtr=temp; + } + + leftPtr++; + rightPtr++; + } + } + } +} +#endif + +#if defined(ARM_MATH_NEON) + + +static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir) +{ + /* Start with two vectors: + * +---+---+---+---+ + * | a | b | c | d | + * +---+---+---+---+ + * +---+---+---+---+ + * | e | f | g | h | + * +---+---+---+---+ + * All the elements of the first are guaranteed to be less than or equal to + * all of the elements in the second, and both vectors are bitonic. + * We need to perform these operations to completely sort both lists: + * vminmax([abcd],[efgh]) + * vminmax([acbd],[egfh]) + */ + vtrn128_64q(a, b); + /* +---+---+---+---+ + * | a | b | e | f | + * +---+---+---+---+ + * +---+---+---+---+ + * | c | d | g | h | + * +---+---+---+---+ + */ + if(dir) + vminmaxq(a, b); + else + vminmaxq(b, a); + + vtrn128_32q(a, b); + /* +---+---+---+---+ + * | a | c | e | g | + * +---+---+---+---+ + * +---+---+---+---+ + * | b | d | f | h | + * +---+---+---+---+ + */ + if(dir) + vminmaxq(a, b); + else + vminmaxq(b, a); + + return vzipq_f32(a, b); +} + + +static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir) +{ + /* a and b are guaranteed to be bitonic */ + // Reverse the element of the second vector + b = vrev128q_f32(b); + + // Compare the two vectors + if(dir) + vminmaxq(a, b); + else + vminmaxq(b, a); + + // Merge the two vectors + float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir); + + return ab; +} + +static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir) +{ + /* Start with two vectors: + * +---+---+---+---+---+---+---+---+ + * | a | b | c | d | e | f | g | h | + * +---+---+---+---+---+---+---+---+ + * +---+---+---+---+---+---+---+---+ + * | i | j | k | l | m | n | o | p | + * +---+---+---+---+---+---+---+---+ + * All the elements of the first are guaranteed to be less than or equal to + * all of the elements in the second, and both vectors are bitonic. + * We need to perform these operations to completely sort both lists: + * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop]) + * vminmax([abef],[cdgh]) vminmax([ijmn],[klop]) + * vminmax([acef],[bdfh]) vminmax([ikmo],[jlmp]) + */ + + vtrn256_128q(a, b); + /* +---+---+---+---+---+---+---+---+ + * | a | b | c | d | i | j | k | l | + * +---+---+---+---+---+---+---+---+ + * +---+---+---+---+---+---+---+---+ + * | e | f | g | h | m | n | o | p | + * +---+---+---+---+---+---+---+---+ + */ + if(dir) + vminmax256q(a, b); + else + vminmax256q(b, a); + + vtrn256_64q(a, b); + + /* +---+---+---+---+---+---+---+---+ + * | a | b | e | f | i | j | m | n | + * +---+---+---+---+---+---+---+---+ + * +---+---+---+---+---+---+---+---+ + * | c | d | g | h | k | l | o | p | + * +---+---+---+---+---+---+---+---+ + */ + if(dir) + vminmax256q(a, b); + else + vminmax256q(b, a); + + vtrn256_32q(a, b); + /* We now have: + * +---+---+---+---+---+---+---+---+ + * | a | c | e | g | i | k | m | o | + * +---+---+---+---+---+---+---+---+ + * +---+---+---+---+---+---+---+---+ + * | b | d | f | h | j | l | n | p | + * +---+---+---+---+---+---+---+---+ + */ + if(dir) + vminmax256q(a, b); + else + vminmax256q(b, a); + + float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]); + float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]); + + vst1q_f32(pOut, out1.val[0]); + vst1q_f32(pOut+4, out1.val[1]); + vst1q_f32(pOut+8, out2.val[0]); + vst1q_f32(pOut+12, out2.val[1]); +} + +static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir) +{ + // Merge two preordered float32x4x2_t + vrev256q_f32(b); + + if(dir) + vminmax256q(a, b); + else + vminmax256q(b, a); + + arm_bitonic_resort_16_f32(pOut, a, b, dir); +} + +static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir) +{ + float32x4_t a; + float32x4_t b; + float32x4_t c; + float32x4_t d; + + // Load 16 samples + a = vld1q_f32(pSrc); + b = vld1q_f32(pSrc+4); + c = vld1q_f32(pSrc+8); + d = vld1q_f32(pSrc+12); + + // Bitonic sorting network for 4 samples x 4 times + if(dir) + { + vminmaxq(a, b); + vminmaxq(c, d); + + vminmaxq(a, d); + vminmaxq(b, c); + + vminmaxq(a, b); + vminmaxq(c, d); + } + else + { + vminmaxq(b, a); + vminmaxq(d, c); + + vminmaxq(d, a); + vminmaxq(c, b); + + vminmaxq(b, a); + vminmaxq(d, c); + } + + float32x4x2_t ab = vtrnq_f32 (a, b); + float32x4x2_t cd = vtrnq_f32 (c, d); + + // Transpose 4 ordered arrays of 4 samples + a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0])); + b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1])); + c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0])); + d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1])); + + // Merge pairs of arrays of 4 samples + ab = arm_bitonic_merge_8_f32(a, b, dir); + cd = arm_bitonic_merge_8_f32(c, d, dir); + + // Merge arrays of 8 samples + arm_bitonic_merge_16_f32(pDst, ab, cd, dir); +} + + + + + +static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir) +{ + //Compare + if(dir) + { + vminmax256q(ab1, cd1); + vminmax256q(ab2, cd2); + } + else + { + vminmax256q(cd1, ab1); + vminmax256q(cd2, ab2); + } + //Transpose 256 + float32x4_t temp; + + temp = ab2.val[0]; + ab2.val[0] = cd1.val[0]; + cd1.val[0] = temp; + temp = ab2.val[1]; + ab2.val[1] = cd1.val[1]; + cd1.val[1] = temp; + + //Compare + if(dir) + { + vminmax256q(ab1, cd1); + vminmax256q(ab2, cd2); + } + else + { + vminmax256q(cd1, ab1); + vminmax256q(cd2, ab2); + } + + //Transpose 128 + arm_bitonic_merge_16_f32(pSrc+0, ab1, cd1, dir); + arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir); +} + +static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir) +{ + float32x4x2_t ab1, ab2, ab3, ab4; + float32x4x2_t cd1, cd2, cd3, cd4; + + //Load and reverse second array + ab1.val[0] = vld1q_f32(pSrc+0 ); + ab1.val[1] = vld1q_f32(pSrc+4 ); + ab2.val[0] = vld1q_f32(pSrc+8 ); + ab2.val[1] = vld1q_f32(pSrc+12); + ab3.val[0] = vld1q_f32(pSrc+16); + ab3.val[1] = vld1q_f32(pSrc+20); + ab4.val[0] = vld1q_f32(pSrc+24); + ab4.val[1] = vld1q_f32(pSrc+28); + + vldrev128q_f32(cd4.val[1], pSrc+32); + vldrev128q_f32(cd4.val[0], pSrc+36); + vldrev128q_f32(cd3.val[1], pSrc+40); + vldrev128q_f32(cd3.val[0], pSrc+44); + vldrev128q_f32(cd2.val[1], pSrc+48); + vldrev128q_f32(cd2.val[0], pSrc+52); + vldrev128q_f32(cd1.val[1], pSrc+56); + vldrev128q_f32(cd1.val[0], pSrc+60); + + //Compare + if(dir) + { + vminmax256q(ab1, cd1); + vminmax256q(ab2, cd2); + vminmax256q(ab3, cd3); + vminmax256q(ab4, cd4); + } + else + { + vminmax256q(cd1, ab1); + vminmax256q(cd2, ab2); + vminmax256q(cd3, ab3); + vminmax256q(cd4, ab4); + } + + //Transpose 512 + float32x4_t temp; + + temp = ab3.val[0]; + ab3.val[0] = cd1.val[0]; + cd1.val[0] = temp; + temp = ab3.val[1]; + ab3.val[1] = cd1.val[1]; + cd1.val[1] = temp; + temp = ab4.val[0]; + ab4.val[0] = cd2.val[0]; + cd2.val[0] = temp; + temp = ab4.val[1]; + ab4.val[1] = cd2.val[1]; + cd2.val[1] = temp; + + //Compare + if(dir) + { + vminmax256q(ab1, cd1); + vminmax256q(ab2, cd2); + vminmax256q(ab3, cd3); + vminmax256q(ab4, cd4); + } + else + { + vminmax256q(cd1, ab1); + vminmax256q(cd2, ab2); + vminmax256q(cd3, ab3); + vminmax256q(cd4, ab4); + } + + //Transpose 256 + arm_bitonic_merge_32_f32(pSrc+0, ab1, ab2, cd1, cd2, dir); + arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir); +} + +static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir) +{ + float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8; + float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8; + + //Load and reverse second array + ab1.val[0] = vld1q_f32(pSrc+0 ); + ab1.val[1] = vld1q_f32(pSrc+4 ); + ab2.val[0] = vld1q_f32(pSrc+8 ); + ab2.val[1] = vld1q_f32(pSrc+12); + ab3.val[0] = vld1q_f32(pSrc+16); + ab3.val[1] = vld1q_f32(pSrc+20); + ab4.val[0] = vld1q_f32(pSrc+24); + ab4.val[1] = vld1q_f32(pSrc+28); + ab5.val[0] = vld1q_f32(pSrc+32); + ab5.val[1] = vld1q_f32(pSrc+36); + ab6.val[0] = vld1q_f32(pSrc+40); + ab6.val[1] = vld1q_f32(pSrc+44); + ab7.val[0] = vld1q_f32(pSrc+48); + ab7.val[1] = vld1q_f32(pSrc+52); + ab8.val[0] = vld1q_f32(pSrc+56); + ab8.val[1] = vld1q_f32(pSrc+60); + + vldrev128q_f32(cd8.val[1], pSrc+64); + vldrev128q_f32(cd8.val[0], pSrc+68); + vldrev128q_f32(cd7.val[1], pSrc+72); + vldrev128q_f32(cd7.val[0], pSrc+76); + vldrev128q_f32(cd6.val[1], pSrc+80); + vldrev128q_f32(cd6.val[0], pSrc+84); + vldrev128q_f32(cd5.val[1], pSrc+88); + vldrev128q_f32(cd5.val[0], pSrc+92); + vldrev128q_f32(cd4.val[1], pSrc+96); + vldrev128q_f32(cd4.val[0], pSrc+100); + vldrev128q_f32(cd3.val[1], pSrc+104); + vldrev128q_f32(cd3.val[0], pSrc+108); + vldrev128q_f32(cd2.val[1], pSrc+112); + vldrev128q_f32(cd2.val[0], pSrc+116); + vldrev128q_f32(cd1.val[1], pSrc+120); + vldrev128q_f32(cd1.val[0], pSrc+124); + + //Compare + if(dir) + { + vminmax256q(ab1, cd1); + vminmax256q(ab2, cd2); + vminmax256q(ab3, cd3); + vminmax256q(ab4, cd4); + vminmax256q(ab5, cd5); + vminmax256q(ab6, cd6); + vminmax256q(ab7, cd7); + vminmax256q(ab8, cd8); + } + else + { + vminmax256q(cd1, ab1); + vminmax256q(cd2, ab2); + vminmax256q(cd3, ab3); + vminmax256q(cd4, ab4); + vminmax256q(cd5, ab5); + vminmax256q(cd6, ab6); + vminmax256q(cd7, ab7); + vminmax256q(cd8, ab8); + } + + //Transpose + float32x4_t temp; + + temp = ab5.val[0]; + ab5.val[0] = cd1.val[0]; + cd1.val[0] = temp; + temp = ab5.val[1]; + ab5.val[1] = cd1.val[1]; + cd1.val[1] = temp; + temp = ab6.val[0]; + ab6.val[0] = cd2.val[0]; + cd2.val[0] = temp; + temp = ab6.val[1]; + ab6.val[1] = cd2.val[1]; + cd2.val[1] = temp; + temp = ab7.val[0]; + ab7.val[0] = cd3.val[0]; + cd3.val[0] = temp; + temp = ab7.val[1]; + ab7.val[1] = cd3.val[1]; + cd3.val[1] = temp; + temp = ab8.val[0]; + ab8.val[0] = cd4.val[0]; + cd4.val[0] = temp; + temp = ab8.val[1]; + ab8.val[1] = cd4.val[1]; + cd4.val[1] = temp; + + //Compare + if(dir) + { + vminmax256q(ab1, cd1); + vminmax256q(ab2, cd2); + vminmax256q(ab3, cd3); + vminmax256q(ab4, cd4); + vminmax256q(ab5, cd5); + vminmax256q(ab6, cd6); + vminmax256q(ab7, cd7); + vminmax256q(ab8, cd8); + } + else + { + vminmax256q(cd1, ab1); + vminmax256q(cd2, ab2); + vminmax256q(cd3, ab3); + vminmax256q(cd4, ab4); + vminmax256q(cd5, ab5); + vminmax256q(cd6, ab6); + vminmax256q(cd7, ab7); + vminmax256q(cd8, ab8); + } + + vst1q_f32(pSrc, ab1.val[0]); + vst1q_f32(pSrc+4, ab1.val[1]); + vst1q_f32(pSrc+8, ab2.val[0]); + vst1q_f32(pSrc+12, ab2.val[1]); + vst1q_f32(pSrc+16, ab3.val[0]); + vst1q_f32(pSrc+20, ab3.val[1]); + vst1q_f32(pSrc+24, ab4.val[0]); + vst1q_f32(pSrc+28, ab4.val[1]); + vst1q_f32(pSrc+32, cd1.val[0]); + vst1q_f32(pSrc+36, cd1.val[1]); + vst1q_f32(pSrc+40, cd2.val[0]); + vst1q_f32(pSrc+44, cd2.val[1]); + vst1q_f32(pSrc+48, cd3.val[0]); + vst1q_f32(pSrc+52, cd3.val[1]); + vst1q_f32(pSrc+56, cd4.val[0]); + vst1q_f32(pSrc+60, cd4.val[1]); + vst1q_f32(pSrc+64, ab5.val[0]); + vst1q_f32(pSrc+68, ab5.val[1]); + vst1q_f32(pSrc+72, ab6.val[0]); + vst1q_f32(pSrc+76, ab6.val[1]); + vst1q_f32(pSrc+80, ab7.val[0]); + vst1q_f32(pSrc+84, ab7.val[1]); + vst1q_f32(pSrc+88, ab8.val[0]); + vst1q_f32(pSrc+92, ab8.val[1]); + vst1q_f32(pSrc+96, cd5.val[0]); + vst1q_f32(pSrc+100, cd5.val[1]); + vst1q_f32(pSrc+104, cd6.val[0]); + vst1q_f32(pSrc+108, cd6.val[1]); + vst1q_f32(pSrc+112, cd7.val[0]); + vst1q_f32(pSrc+116, cd7.val[1]); + vst1q_f32(pSrc+120, cd8.val[0]); + vst1q_f32(pSrc+124, cd8.val[1]); + + //Transpose + arm_bitonic_merge_64_f32(pSrc+0 , dir); + arm_bitonic_merge_64_f32(pSrc+64, dir); +} + +static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir) +{ + float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8; + float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16; + float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8; + float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16; + + //Load and reverse second array + ab1.val[0] = vld1q_f32(pSrc+0 ); + ab1.val[1] = vld1q_f32(pSrc+4 ); + ab2.val[0] = vld1q_f32(pSrc+8 ); + ab2.val[1] = vld1q_f32(pSrc+12 ); + ab3.val[0] = vld1q_f32(pSrc+16 ); + ab3.val[1] = vld1q_f32(pSrc+20 ); + ab4.val[0] = vld1q_f32(pSrc+24 ); + ab4.val[1] = vld1q_f32(pSrc+28 ); + ab5.val[0] = vld1q_f32(pSrc+32 ); + ab5.val[1] = vld1q_f32(pSrc+36 ); + ab6.val[0] = vld1q_f32(pSrc+40 ); + ab6.val[1] = vld1q_f32(pSrc+44 ); + ab7.val[0] = vld1q_f32(pSrc+48 ); + ab7.val[1] = vld1q_f32(pSrc+52 ); + ab8.val[0] = vld1q_f32(pSrc+56 ); + ab8.val[1] = vld1q_f32(pSrc+60 ); + ab9.val[0] = vld1q_f32(pSrc+64 ); + ab9.val[1] = vld1q_f32(pSrc+68 ); + ab10.val[0] = vld1q_f32(pSrc+72 ); + ab10.val[1] = vld1q_f32(pSrc+76 ); + ab11.val[0] = vld1q_f32(pSrc+80 ); + ab11.val[1] = vld1q_f32(pSrc+84 ); + ab12.val[0] = vld1q_f32(pSrc+88 ); + ab12.val[1] = vld1q_f32(pSrc+92 ); + ab13.val[0] = vld1q_f32(pSrc+96 ); + ab13.val[1] = vld1q_f32(pSrc+100); + ab14.val[0] = vld1q_f32(pSrc+104); + ab14.val[1] = vld1q_f32(pSrc+108); + ab15.val[0] = vld1q_f32(pSrc+112); + ab15.val[1] = vld1q_f32(pSrc+116); + ab16.val[0] = vld1q_f32(pSrc+120); + ab16.val[1] = vld1q_f32(pSrc+124); + + vldrev128q_f32(cd16.val[1], pSrc+128); + vldrev128q_f32(cd16.val[0], pSrc+132); + vldrev128q_f32(cd15.val[1], pSrc+136); + vldrev128q_f32(cd15.val[0], pSrc+140); + vldrev128q_f32(cd14.val[1], pSrc+144); + vldrev128q_f32(cd14.val[0], pSrc+148); + vldrev128q_f32(cd13.val[1], pSrc+152); + vldrev128q_f32(cd13.val[0], pSrc+156); + vldrev128q_f32(cd12.val[1], pSrc+160); + vldrev128q_f32(cd12.val[0], pSrc+164); + vldrev128q_f32(cd11.val[1], pSrc+168); + vldrev128q_f32(cd11.val[0], pSrc+172); + vldrev128q_f32(cd10.val[1], pSrc+176); + vldrev128q_f32(cd10.val[0], pSrc+180); + vldrev128q_f32(cd9.val[1] , pSrc+184); + vldrev128q_f32(cd9.val[0] , pSrc+188); + vldrev128q_f32(cd8.val[1] , pSrc+192); + vldrev128q_f32(cd8.val[0] , pSrc+196); + vldrev128q_f32(cd7.val[1] , pSrc+200); + vldrev128q_f32(cd7.val[0] , pSrc+204); + vldrev128q_f32(cd6.val[1] , pSrc+208); + vldrev128q_f32(cd6.val[0] , pSrc+212); + vldrev128q_f32(cd5.val[1] , pSrc+216); + vldrev128q_f32(cd5.val[0] , pSrc+220); + vldrev128q_f32(cd4.val[1] , pSrc+224); + vldrev128q_f32(cd4.val[0] , pSrc+228); + vldrev128q_f32(cd3.val[1] , pSrc+232); + vldrev128q_f32(cd3.val[0] , pSrc+236); + vldrev128q_f32(cd2.val[1] , pSrc+240); + vldrev128q_f32(cd2.val[0] , pSrc+244); + vldrev128q_f32(cd1.val[1] , pSrc+248); + vldrev128q_f32(cd1.val[0] , pSrc+252); + + //Compare + if(dir) + { + vminmax256q(ab1 , cd1 ); + vminmax256q(ab2 , cd2 ); + vminmax256q(ab3 , cd3 ); + vminmax256q(ab4 , cd4 ); + vminmax256q(ab5 , cd5 ); + vminmax256q(ab6 , cd6 ); + vminmax256q(ab7 , cd7 ); + vminmax256q(ab8 , cd8 ); + vminmax256q(ab9 , cd9 ); + vminmax256q(ab10, cd10); + vminmax256q(ab11, cd11); + vminmax256q(ab12, cd12); + vminmax256q(ab13, cd13); + vminmax256q(ab14, cd14); + vminmax256q(ab15, cd15); + vminmax256q(ab16, cd16); + } + else + { + vminmax256q(cd1 , ab1 ); + vminmax256q(cd2 , ab2 ); + vminmax256q(cd3 , ab3 ); + vminmax256q(cd4 , ab4 ); + vminmax256q(cd5 , ab5 ); + vminmax256q(cd6 , ab6 ); + vminmax256q(cd7 , ab7 ); + vminmax256q(cd8 , ab8 ); + vminmax256q(cd9 , ab9 ); + vminmax256q(cd10, ab10); + vminmax256q(cd11, ab11); + vminmax256q(cd12, ab12); + vminmax256q(cd13, ab13); + vminmax256q(cd14, ab14); + vminmax256q(cd15, ab15); + vminmax256q(cd16, ab16); + } + + //Transpose + float32x4_t temp; + + temp = ab9.val[0]; + ab9.val[0] = cd1.val[0]; + cd1.val[0] = temp; + temp = ab9.val[1]; + ab9.val[1] = cd1.val[1]; + cd1.val[1] = temp; + temp = ab10.val[0]; + ab10.val[0] = cd2.val[0]; + cd2.val[0] = temp; + temp = ab10.val[1]; + ab10.val[1] = cd2.val[1]; + cd2.val[1] = temp; + temp = ab11.val[0]; + ab11.val[0] = cd3.val[0]; + cd3.val[0] = temp; + temp = ab11.val[1]; + ab11.val[1] = cd3.val[1]; + cd3.val[1] = temp; + temp = ab12.val[0]; + ab12.val[0] = cd4.val[0]; + cd4.val[0] = temp; + temp = ab12.val[1]; + ab12.val[1] = cd4.val[1]; + cd4.val[1] = temp; + temp = ab13.val[0]; + ab13.val[0] = cd5.val[0]; + cd5.val[0] = temp; + temp = ab13.val[1]; + ab13.val[1] = cd5.val[1]; + cd5.val[1] = temp; + temp = ab14.val[0]; + ab14.val[0] = cd6.val[0]; + cd6.val[0] = temp; + temp = ab14.val[1]; + ab14.val[1] = cd6.val[1]; + cd6.val[1] = temp; + temp = ab15.val[0]; + ab15.val[0] = cd7.val[0]; + cd7.val[0] = temp; + temp = ab15.val[1]; + ab15.val[1] = cd7.val[1]; + cd7.val[1] = temp; + temp = ab16.val[0]; + ab16.val[0] = cd8.val[0]; + cd8.val[0] = temp; + temp = ab16.val[1]; + ab16.val[1] = cd8.val[1]; + cd8.val[1] = temp; + + //Compare + if(dir) + { + vminmax256q(ab1 , cd1 ); + vminmax256q(ab2 , cd2 ); + vminmax256q(ab3 , cd3 ); + vminmax256q(ab4 , cd4 ); + vminmax256q(ab5 , cd5 ); + vminmax256q(ab6 , cd6 ); + vminmax256q(ab7 , cd7 ); + vminmax256q(ab8 , cd8 ); + vminmax256q(ab9 , cd9 ); + vminmax256q(ab10, cd10); + vminmax256q(ab11, cd11); + vminmax256q(ab12, cd12); + vminmax256q(ab13, cd13); + vminmax256q(ab14, cd14); + vminmax256q(ab15, cd15); + vminmax256q(ab16, cd16); + } + else + { + vminmax256q(cd1 , ab1 ); + vminmax256q(cd2 , ab2 ); + vminmax256q(cd3 , ab3 ); + vminmax256q(cd4 , ab4 ); + vminmax256q(cd5 , ab5 ); + vminmax256q(cd6 , ab6 ); + vminmax256q(cd7 , ab7 ); + vminmax256q(cd8 , ab8 ); + vminmax256q(cd9 , ab9 ); + vminmax256q(cd10, ab10); + vminmax256q(cd11, ab11); + vminmax256q(cd12, ab12); + vminmax256q(cd13, ab13); + vminmax256q(cd14, ab14); + vminmax256q(cd15, ab15); + vminmax256q(cd16, ab16); + } + + vst1q_f32(pSrc, ab1.val[0] ); + vst1q_f32(pSrc+4, ab1.val[1] ); + vst1q_f32(pSrc+8, ab2.val[0] ); + vst1q_f32(pSrc+12, ab2.val[1] ); + vst1q_f32(pSrc+16, ab3.val[0] ); + vst1q_f32(pSrc+20, ab3.val[1] ); + vst1q_f32(pSrc+24, ab4.val[0] ); + vst1q_f32(pSrc+28, ab4.val[1] ); + vst1q_f32(pSrc+32, ab5.val[0] ); + vst1q_f32(pSrc+36, ab5.val[1] ); + vst1q_f32(pSrc+40, ab6.val[0] ); + vst1q_f32(pSrc+44, ab6.val[1] ); + vst1q_f32(pSrc+48, ab7.val[0] ); + vst1q_f32(pSrc+52, ab7.val[1] ); + vst1q_f32(pSrc+56, ab8.val[0] ); + vst1q_f32(pSrc+60, ab8.val[1] ); + vst1q_f32(pSrc+64, cd1.val[0] ); + vst1q_f32(pSrc+68, cd1.val[1] ); + vst1q_f32(pSrc+72, cd2.val[0] ); + vst1q_f32(pSrc+76, cd2.val[1] ); + vst1q_f32(pSrc+80, cd3.val[0] ); + vst1q_f32(pSrc+84, cd3.val[1] ); + vst1q_f32(pSrc+88, cd4.val[0] ); + vst1q_f32(pSrc+92, cd4.val[1] ); + vst1q_f32(pSrc+96, cd5.val[0] ); + vst1q_f32(pSrc+100, cd5.val[1] ); + vst1q_f32(pSrc+104, cd6.val[0] ); + vst1q_f32(pSrc+108, cd6.val[1] ); + vst1q_f32(pSrc+112, cd7.val[0] ); + vst1q_f32(pSrc+116, cd7.val[1] ); + vst1q_f32(pSrc+120, cd8.val[0] ); + vst1q_f32(pSrc+124, cd8.val[1] ); + vst1q_f32(pSrc+128, ab9.val[0] ); + vst1q_f32(pSrc+132, ab9.val[1] ); + vst1q_f32(pSrc+136, ab10.val[0]); + vst1q_f32(pSrc+140, ab10.val[1]); + vst1q_f32(pSrc+144, ab11.val[0]); + vst1q_f32(pSrc+148, ab11.val[1]); + vst1q_f32(pSrc+152, ab12.val[0]); + vst1q_f32(pSrc+156, ab12.val[1]); + vst1q_f32(pSrc+160, ab13.val[0]); + vst1q_f32(pSrc+164, ab13.val[1]); + vst1q_f32(pSrc+168, ab14.val[0]); + vst1q_f32(pSrc+172, ab14.val[1]); + vst1q_f32(pSrc+176, ab15.val[0]); + vst1q_f32(pSrc+180, ab15.val[1]); + vst1q_f32(pSrc+184, ab16.val[0]); + vst1q_f32(pSrc+188, ab16.val[1]); + vst1q_f32(pSrc+192, cd9.val[0] ); + vst1q_f32(pSrc+196, cd9.val[1] ); + vst1q_f32(pSrc+200, cd10.val[0]); + vst1q_f32(pSrc+204, cd10.val[1]); + vst1q_f32(pSrc+208, cd11.val[0]); + vst1q_f32(pSrc+212, cd11.val[1]); + vst1q_f32(pSrc+216, cd12.val[0]); + vst1q_f32(pSrc+220, cd12.val[1]); + vst1q_f32(pSrc+224, cd13.val[0]); + vst1q_f32(pSrc+228, cd13.val[1]); + vst1q_f32(pSrc+232, cd14.val[0]); + vst1q_f32(pSrc+236, cd14.val[1]); + vst1q_f32(pSrc+240, cd15.val[0]); + vst1q_f32(pSrc+244, cd15.val[1]); + vst1q_f32(pSrc+248, cd16.val[0]); + vst1q_f32(pSrc+252, cd16.val[1]); + + //Transpose + arm_bitonic_merge_128_f32(pSrc+0 , dir); + arm_bitonic_merge_128_f32(pSrc+128, dir); +} + +#define SWAP(a,i,j) \ + temp = vgetq_lane_f32(a, j); \ + a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\ + a = vsetq_lane_f32(temp, a, i); + +static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir) +{ + float32_t temp; + + + if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) ) + { + SWAP(a,0,1); + } + if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) ) + { + SWAP(a,2,3); + } + + if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) ) + { + SWAP(a,0,3); + } + if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) ) + { + SWAP(a,1,2); + } + + if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) ) + { + SWAP(a,0,1); + } + if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) ) + { + SWAP(a,2,3); + } + + return a; +} + +static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir) +{ + a = arm_bitonic_sort_4_f32(a, dir); + b = arm_bitonic_sort_4_f32(b, dir); + return arm_bitonic_merge_8_f32(a, b, dir); +} + + + +#endif + +/** + @ingroup groupSupport + */ + +/** + @defgroup Sorting Vector sorting algorithms + + Sort the elements of a vector + + There are separate functions for floating-point, Q31, Q15, and Q7 data types. + */ + +/** + @addtogroup Sorting + @{ + */ + +/** + * @private + * @param[in] S points to an instance of the sorting structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + */ +void arm_bitonic_sort_f32( +const arm_sort_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint16_t s, i; + uint8_t dir = S->dir; + +#ifdef ARM_MATH_NEON + (void)s; + + float32_t * pOut; + uint16_t counter = blockSize>>5; + + if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only + { + if(pSrc == pDst) // in-place + pOut = pSrc; + else + pOut = pDst; + + float32x4x2_t ab1, ab2; + float32x4x2_t cd1, cd2; + + if(blockSize == 1) + pOut = pSrc; + else if(blockSize == 2) + { + float32_t temp; + + if( dir==(pSrc[0]>pSrc[1]) ) + { + temp = pSrc[1]; + pOut[1] = pSrc[0]; + pOut[0] = temp; + } + else + pOut = pSrc; + } + else if(blockSize == 4) + { + float32x4_t a = vld1q_f32(pSrc); + + a = arm_bitonic_sort_4_f32(a, dir); + + vst1q_f32(pOut, a); + } + else if(blockSize == 8) + { + float32x4_t a; + float32x4_t b; + float32x4x2_t ab; + + a = vld1q_f32(pSrc); + b = vld1q_f32(pSrc+4); + + ab = arm_bitonic_sort_8_f32(a, b, dir); + + vst1q_f32(pOut, ab.val[0]); + vst1q_f32(pOut+4, ab.val[1]); + } + else if(blockSize >=16) + { + // Order 16 bits long vectors + for(i=0; i<blockSize; i=i+16) + arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir); + + // Merge + for(i=0; i<counter; i++) + { + // Load and reverse second vector + ab1.val[0] = vld1q_f32(pOut+32*i+0 ); + ab1.val[1] = vld1q_f32(pOut+32*i+4 ); + ab2.val[0] = vld1q_f32(pOut+32*i+8 ); + ab2.val[1] = vld1q_f32(pOut+32*i+12); + + vldrev128q_f32(cd2.val[1], pOut+32*i+16); + vldrev128q_f32(cd2.val[0], pOut+32*i+20); + vldrev128q_f32(cd1.val[1], pOut+32*i+24); + vldrev128q_f32(cd1.val[0], pOut+32*i+28); + + arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir); + } + + counter = counter>>1; + for(i=0; i<counter; i++) + arm_bitonic_merge_64_f32(pOut+64*i, dir); + + counter = counter>>1; + for(i=0; i<counter; i++) + arm_bitonic_merge_128_f32(pOut+128*i, dir); + + counter = counter>>1; + for(i=0; i<counter; i++) + arm_bitonic_merge_256_f32(pOut+256*i, dir); + + // Etc... + } + } + +#else + + float32_t * pA; + + if(pSrc != pDst) // out-of-place + { + memcpy(pDst, pSrc, blockSize*sizeof(float32_t) ); + pA = pDst; + } + else + pA = pSrc; + + + if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only + { + for(s=2; s<=blockSize; s=s*2) + { + for(i=0; i<blockSize; i=i+s) + arm_bitonic_sort_core_f32(pA+i, s, dir); + } + } +#endif +} + +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c new file mode 100644 index 0000000..640778d --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c @@ -0,0 +1,104 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_bubble_sort_f32.c + * Description: Floating point bubble sort + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" +#include "arm_sorting.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + +/** + * @private + * @param[in] S points to an instance of the sorting structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * + * @par Algorithm + * The bubble sort algorithm is a simple comparison algorithm that + * reads the elements of a vector from the beginning to the end, + * compares the adjacent ones and swaps them if they are in the + * wrong order. The procedure is repeated until there is nothing + * left to swap. Bubble sort is fast for input vectors that are + * nearly sorted. + * + * @par It's an in-place algorithm. In order to obtain an out-of-place + * function, a memcpy of the source vector is performed + */ + +void arm_bubble_sort_f32( + const arm_sort_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint8_t dir = S->dir; + uint32_t i; + uint8_t swapped =1; + float32_t * pA; + float32_t temp; + + if(pSrc != pDst) // out-of-place + { + memcpy(pDst, pSrc, blockSize*sizeof(float32_t) ); + pA = pDst; + } + else + pA = pSrc; + + while(swapped==1) // If nothing has been swapped after one loop stop + { + swapped=0; + + for(i=0; i<blockSize-1; i++) + { + if(dir==(pA[i]>pA[i+1])) + { + // Swap + temp = pA[i]; + pA[i] = pA[i+1]; + pA[i+1] = temp; + + // Update flag + swapped = 1; + } + } + + blockSize--; + } +} + +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c new file mode 100644 index 0000000..d441332 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c @@ -0,0 +1,130 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_copy_f16.c + * Description: Copies the elements of a floating-point vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + +/** + @ingroup groupSupport + */ + + +/** + @addtogroup copy + @{ + */ + +/** + @brief Copies the elements of a f16 vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +void arm_copy_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize) +{ + do { + mve_pred16_t p = vctp16q(blockSize); + + vstrhq_p_f16(pDst, + vldrhq_z_f16((float16_t const *) pSrc, p), p); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pSrc += 8; + pDst += 8; + blockSize -= 8; + } + while ((int32_t) blockSize > 0); +} + +#else + +void arm_copy_f16( + const float16_t * pSrc, + float16_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of BasicCopy group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c new file mode 100644 index 0000000..d739f7c --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c @@ -0,0 +1,192 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_copy_f32.c + * Description: Copies the elements of a floating-point vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @defgroup copy Vector Copy + + Copies sample by sample from source vector to destination vector. + + <pre> + pDst[n] = pSrc[n]; 0 <= n < blockSize. + </pre> + + There are separate functions for floating point, Q31, Q15, and Q7 data types. + */ + +/** + @addtogroup copy + @{ + */ + +/** + @brief Copies the elements of a floating-point vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +void arm_copy_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time */ + while (blkCnt > 0U) + { + vstrwq_f32(pDst, vldrwq_f32(pSrc)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pSrc += 4; + pDst += 4; + blkCnt --; + } + + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + +} + +#else +#if defined(ARM_MATH_NEON_EXPERIMENTAL) +void arm_copy_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counter */ + + float32x4_t inV; + + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* C = A */ + /* Copy and then store the results in the destination buffer */ + inV = vld1q_f32(pSrc); + vst1q_f32(pDst, inV); + pSrc += 4; + pDst += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + /* C = A */ + /* Copy and then store the results in the destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_copy_f32( + const float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* #if defined(ARM_MATH_NEON) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of BasicCopy group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c new file mode 100644 index 0000000..ec1df54 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_copy_f64.c + * Description: Copies the elements of a floating-point vector + * + * $Date: 13 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup copy + @{ + */ + +/** + @brief Copies the elements of a floating-point vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +void arm_copy_f64( + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } +} + +/** + @} end of BasicCopy group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c new file mode 100644 index 0000000..18f3387 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c @@ -0,0 +1,130 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_copy_q15.c + * Description: Copies the elements of a Q15 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup copy + @{ + */ + +/** + @brief Copies the elements of a Q15 vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_copy_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + + blkCnt = blockSize >> 3; + while (blkCnt > 0U) + { + vstrhq_s16(pDst,vldrhq_s16(pSrc)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pSrc += 8; + pDst += 8; + blkCnt --; + } + + blkCnt = blockSize & 7; + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } +} +#else +void arm_copy_q15( + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A */ + + /* read 2 times 2 samples at a time */ + write_q15x2_ia (&pDst, read_q15x2_ia (&pSrc)); + write_q15x2_ia (&pDst, read_q15x2_ia (&pSrc)); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of BasicCopy group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c new file mode 100644 index 0000000..8e06bda --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c @@ -0,0 +1,135 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_copy_q31.c + * Description: Copies the elements of a Q31 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup copy + @{ + */ + +/** + @brief Copies the elements of a Q31 vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_copy_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time */ + while (blkCnt > 0U) + { + vstrwq_s32(pDst,vldrwq_s32(pSrc)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pSrc += 4; + pDst += 4; + blkCnt --; + } + + blkCnt = blockSize & 3; + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + +} + +#else +void arm_copy_q31( + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of BasicCopy group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c new file mode 100644 index 0000000..1918d3e --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c @@ -0,0 +1,132 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_copy_q7.c + * Description: Copies the elements of a Q7 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup copy + @{ + */ + +/** + @brief Copies the elements of a Q7 vector. + @param[in] pSrc points to input vector + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_copy_q7( + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + + uint32_t blkCnt; + + blkCnt = blockSize >> 4; + while (blkCnt > 0U) + { + + vstrbq_s8(pDst,vldrbq_s8(pSrc)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pSrc += 16; + pDst += 16; + blkCnt --; + } + + blkCnt = blockSize & 0xF; + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } +} + +#else +void arm_copy_q7( + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A */ + + /* read 4 samples at a time */ + write_q7x4_ia (&pDst, read_q7x4_ia (&pSrc)); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A */ + + /* Copy and store result in destination buffer */ + *pDst++ = *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of BasicCopy group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c new file mode 100644 index 0000000..a004353 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c @@ -0,0 +1,134 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_float_to_q15.c + * Description: Converts the elements of the floating-point vector to Q15 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + +/** + @ingroup groupSupport + */ + +/** + * @defgroup f16_to_x Convert 16-bit floating point value + */ + +/** + @addtogroup f16_to_x + @{ + */ + +/** + @brief Converts the elements of the f16 vector to f32 vector. + @param[in] pSrc points to the f16 input vector + @param[out] pDst points to the f32 output vector + @param[in] blockSize number of samples in each vector + @return none + + */ + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H) +#pragma GCC warning "Scalar version of arm_f16_to_float built. Helium version has build issues with gcc." +#endif + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H) + +void arm_f16_to_float( + const float16_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + int32_t blkCnt; /* loop counters */ + float16x8_t vecDst; + float32x4x2_t tmp; + + blkCnt = blockSize >> 3; + while (blkCnt > 0) + { + vecDst = vldrhq_f16(pSrc); + pSrc += 8; + + tmp.val[0] = vcvtbq_f32_f16(vecDst); + tmp.val[1] = vcvttq_f32_f16(vecDst); + vst2q(pDst,tmp); + + pDst += 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 7; + while (blkCnt > 0) + { + + *pDst++ = (float32_t) *pSrc++; + /* + * Decrement the loop counter + */ + blkCnt--; + } +} + +#else +void arm_f16_to_float( + const float16_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + const float16_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + /* + * Loop over blockSize number of values + */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + + *pDst++ = (float32_t) * pIn++; + /* + * Decrement the loop counter + */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of f16_to_x group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c new file mode 100644 index 0000000..bb425d1 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c @@ -0,0 +1,157 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_float_to_q15.c + * Description: Converts the elements of the floating-point vector to Q15 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + +/** + @ingroup groupSupport + */ + +/** + @addtogroup f16_to_x + @{ + */ + +/** + @brief Converts the elements of the f16 vector to Q15 vector. + @param[in] pSrc points to the f16 input vector + @param[out] pDst points to the Q15 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q15_t)(pSrc[n] * 32768); 0 <= n < blockSize. + </pre> + + @par Scaling and Overflow Behavior + The function uses saturating arithmetic. + Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated. + + @note + In order to apply rounding in scalar version, the library should be rebuilt with the ROUNDING macro + defined in the preprocessor section of project options. + */ + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +void arm_f16_to_q15( + const float16_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + float16_t maxQ = (float16_t) Q15_MAX; + float16x8_t vecDst; + + + do { + mve_pred16_t p = vctp16q(blockSize); + + vecDst = vldrhq_z_f16((float16_t const *) pSrc, p); + /* C = A * 32767 */ + /* convert from float to Q15 and then store the results in the destination buffer */ + vecDst = vmulq_m(vuninitializedq_f16(), vecDst, maxQ, p); + + vstrhq_p_s16(pDst, + vcvtaq_m(vuninitializedq_s16(), vecDst, p), p); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pSrc += 8; + pDst += 8; + blockSize -= 8; + } + while ((int32_t) blockSize > 0); +} + +#else + +void arm_f16_to_q15( + const float16_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + const float16_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ +#ifdef ARM_MATH_ROUNDING + float16_t in; +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* + * Loop over blockSize number of values + */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + +#ifdef ARM_MATH_ROUNDING + + /* + * C = A * 65536 + */ + /* + * convert from float to Q31 and then store the results in the destination buffer + */ + in = *pIn++; + in = (in * 32768.0); + in += in > 0.0 ? 0.5 : -0.5; + *pDst++ = clip_q31_to_q15((q31_t) (in)); + +#else + + /* + * C = A * 32768 + */ + /* + * convert from float to Q31 and then store the results in the destination buffer + */ + *pDst++ = clip_q31_to_q15((q31_t) ((_Float16)*pIn++ * 32768.0f16)); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* + * Decrement the loop counter + */ + blkCnt--; + } + +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of f16_to_x group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c new file mode 100644 index 0000000..0b08f12 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c @@ -0,0 +1,127 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fill_f16.c + * Description: Fills a constant value into a floating-point vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + +/** + @ingroup groupSupport + */ + + +/** + @addtogroup Fill + @{ + */ + +/** + @brief Fills a constant value into a f16 vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +void arm_fill_f16( + float16_t value, + float16_t * pDst, + uint32_t blockSize) +{ + do { + mve_pred16_t p = vctp16q(blockSize); + + vstrhq_p_f16(pDst, + vdupq_m_n_f16(vuninitializedq_f16(), value, p), p); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pDst += 8; + blockSize -= 8; + } + while ((int32_t) blockSize > 0); +} +#else +void arm_fill_f16( + float16_t value, + float16_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of Fill group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c new file mode 100644 index 0000000..50cdd8f --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c @@ -0,0 +1,189 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fill_f32.c + * Description: Fills a constant value into a floating-point vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @defgroup Fill Vector Fill + + Fills the destination vector with a constant value. + + <pre> + pDst[n] = value; 0 <= n < blockSize. + </pre> + + There are separate functions for floating point, Q31, Q15, and Q7 data types. + */ + +/** + @addtogroup Fill + @{ + */ + +/** + @brief Fills a constant value into a floating-point vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_fill_f32( + float32_t value, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time */ + while (blkCnt > 0U) + { + + vstrwq_f32(pDst,vdupq_n_f32(value)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pDst += 4; + blkCnt --; + } + + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } + +} +#else +#if defined(ARM_MATH_NEON_EXPERIMENTAL) +void arm_fill_f32( + float32_t value, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counter */ + + + float32x4_t inV = vdupq_n_f32(value); + + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* C = value */ + /* Fill the value in the destination buffer */ + vst1q_f32(pDst, inV); + pDst += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + /* C = value */ + /* Fill the value in the destination buffer */ + *pDst++ = value; + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_fill_f32( + float32_t value, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* #if defined(ARM_MATH_NEON) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of Fill group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c new file mode 100644 index 0000000..4bc2700 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fill_f64.c + * Description: Fills a constant value into a floating-point vector + * + * $Date: 13 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Fill + @{ + */ + +/** + @brief Fills a constant value into a floating-point vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +void arm_fill_f64( + float64_t value, + float64_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } +} + +/** + @} end of Fill group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c new file mode 100644 index 0000000..997a728 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c @@ -0,0 +1,134 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fill_q15.c + * Description: Fills a constant value into a Q15 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Fill + @{ + */ + +/** + @brief Fills a constant value into a Q15 vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_fill_q15( + q15_t value, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + blkCnt = blockSize >> 3; + while (blkCnt > 0U) + { + + vstrhq_s16(pDst,vdupq_n_s16(value)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pDst += 8; + blkCnt --; + } + + blkCnt = blockSize & 7; + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } +} + +#else +void arm_fill_q15( + q15_t value, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + q31_t packedValue; /* value packed to 32 bits */ + + /* Packing two 16 bit values to 32 bit value in order to use SIMD */ + packedValue = __PKHBT(value, value, 16U); + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = value */ + + /* fill 2 times 2 samples at a time */ + write_q15x2_ia (&pDst, packedValue); + write_q15x2_ia (&pDst, packedValue); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of Fill group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c new file mode 100644 index 0000000..7da5fb6 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c @@ -0,0 +1,135 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fill_q31.c + * Description: Fills a constant value into a Q31 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Fill + @{ + */ + +/** + @brief Fills a constant value into a Q31 vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_fill_q31( + q31_t value, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time */ + while (blkCnt > 0U) + { + + vstrwq_s32(pDst,vdupq_n_s32(value)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pDst += 4; + blkCnt --; + } + + blkCnt = blockSize & 3; + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } + +} + +#else +void arm_fill_q31( + q31_t value, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of Fill group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c new file mode 100644 index 0000000..830fc73 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c @@ -0,0 +1,133 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_fill_q7.c + * Description: Fills a constant value into a Q7 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Fill + @{ + */ + +/** + @brief Fills a constant value into a Q7 vector. + @param[in] value input value to be filled + @param[out] pDst points to output vector + @param[in] blockSize number of samples in each vector + @return none + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_fill_q7( + q7_t value, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + + blkCnt = blockSize >> 4; + while (blkCnt > 0U) + { + + vstrbq_s8(pDst,vdupq_n_s8(value)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pDst += 16; + blkCnt --; + } + + blkCnt = blockSize & 0xF; + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } +} +#else +void arm_fill_q7( + q7_t value, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + q31_t packedValue; /* value packed to 32 bits */ + + /* Packing four 8 bit values to 32 bit value in order to use SIMD */ + packedValue = __PACKq7(value, value, value, value); + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = value */ + + /* fill 4 samples at a time */ + write_q7x4_ia (&pDst, packedValue); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = value */ + + /* Fill value in destination buffer */ + *pDst++ = value; + + /* Decrement loop counter */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of Fill group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c new file mode 100644 index 0000000..d627a8a --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c @@ -0,0 +1,131 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_float_to_q15.c + * Description: Converts the elements of the floating-point vector to Q15 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + +/** + @ingroup groupSupport + */ + +/** + @addtogroup float_to_x + @{ + */ + +/** + @brief Converts the elements of the floating-point vector to f16 vector. + @param[in] pSrc points to the f32 input vector + @param[out] pDst points to the f16 output vector + @param[in] blockSize number of samples in each vector + @return none + + */ + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H) +#pragma GCC warning "Scalar version of arm_float_to_f16 built. Helium version has build issues with gcc." +#endif + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H) + +void arm_float_to_f16( + const float32_t * pSrc, + float16_t * pDst, + uint32_t blockSize) +{ + int32_t blkCnt; /* loop counters */ + float32x4x2_t tmp; + float16x8_t vecDst; + float32_t const *pSrcVec; + + + pSrcVec = (float32_t const *) pSrc; + blkCnt = blockSize >> 3; + while (blkCnt > 0) + { + /* convert from float32 to float16 and then store the results in the destination buffer */ + tmp = vld2q(pSrcVec); pSrcVec += 8; + /* narrow / merge */ + vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]); + vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]); + vst1q(pDst, vecDst); pDst += 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* + * tail + */ + blkCnt = blockSize & 7; + if (blkCnt > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt); + tmp = vld2q(pSrcVec); + vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]); + vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]); + vstrhq_p(pDst, vecDst, p0); + } +} + +#else + +void arm_float_to_f16( + const float32_t * pSrc, + float16_t * pDst, + uint32_t blockSize) +{ + const float32_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + /* + * Loop over blockSize number of values + */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + + *pDst++ = (float16_t) * pIn++; + /* + * Decrement the loop counter + */ + blkCnt--; + } +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of float_to_x group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c new file mode 100644 index 0000000..8061c9f --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c @@ -0,0 +1,308 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_float_to_q15.c + * Description: Converts the elements of the floating-point vector to Q15 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup float_to_x + @{ + */ + +/** + @brief Converts the elements of the floating-point vector to Q15 vector. + @param[in] pSrc points to the floating-point input vector + @param[out] pDst points to the Q15 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q15_t)(pSrc[n] * 32768); 0 <= n < blockSize. + </pre> + + @par Scaling and Overflow Behavior + The function uses saturating arithmetic. + Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated. + + @note + In order to apply rounding, the library should be rebuilt with the ROUNDING macro + defined in the preprocessor section of project options. + */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_float_to_q15( + const float32_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + float32_t maxQ = (float32_t) Q15_MAX; + f32x4x2_t tmp; + q15x8_t vecDst; +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif + + + blkCnt = blockSize >> 3; + while (blkCnt > 0U) + { + /* C = A * 32768 */ + /* convert from float to q15 and then store the results in the destination buffer */ + tmp = vld2q(pSrc); + + tmp.val[0] = vmulq(tmp.val[0], maxQ); + tmp.val[1] = vmulq(tmp.val[1], maxQ); + + vecDst = vqmovnbq(vecDst, vcvtaq_s32_f32(tmp.val[0])); + vecDst = vqmovntq(vecDst, vcvtaq_s32_f32(tmp.val[1])); + vst1q(pDst, vecDst); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + pDst += 8; + pSrc += 8; + } + + blkCnt = blockSize & 7; + while (blkCnt > 0U) + { + /* C = A * 32768 */ + + /* convert from float to Q15 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pSrc++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + +#else + + /* C = A * 32768 */ + /* Convert from float to q15 and then store the results in the destination buffer */ + *pDst++ = (q15_t) __SSAT((q31_t) (*pSrc++ * 32768.0f), 16); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } +} + +#else +#if defined(ARM_MATH_NEON_EXPERIMENTAL) +void arm_float_to_q15( + const float32_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + const float32_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + float32x4_t inV; + #ifdef ARM_MATH_ROUNDING + float32x4_t zeroV = vdupq_n_f32(0.0f); + float32x4_t pHalf = vdupq_n_f32(0.5f / 32768.0f); + float32x4_t mHalf = vdupq_n_f32(-0.5f / 32768.0f); + float32x4_t r; + uint32x4_t cmp; + float32_t in; + #endif + + int32x4_t cvt; + int16x4_t outV; + + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + +#ifdef ARM_MATH_ROUNDING + /* C = A * 32768 */ + /* Convert from float to q15 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + cmp = vcgtq_f32(inV,zeroV); + r = vbslq_f32(cmp,pHalf,mHalf); + inV = vaddq_f32(inV, r); + + pIn += 4; + + cvt = vcvtq_n_s32_f32(inV,15); + outV = vqmovn_s32(cvt); + + vst1_s16(pDst, outV); + pDst += 4; + +#else + + /* C = A * 32768 */ + /* Convert from float to q15 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + + cvt = vcvtq_n_s32_f32(inV,15); + outV = vqmovn_s32(cvt); + + vst1_s16(pDst, outV); + pDst += 4; + pIn += 4; + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + +#ifdef ARM_MATH_ROUNDING + /* C = A * 32768 */ + /* Convert from float to q15 and then store the results in the destination buffer */ + in = *pIn++; + in = (in * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + +#else + + /* C = A * 32768 */ + /* Convert from float to q15 and then store the results in the destination buffer */ + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_float_to_q15( + const float32_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const float32_t *pIn = pSrc; /* Source pointer */ + +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif /* #ifdef ARM_MATH_ROUNDING */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A * 32768 */ + + /* convert from float to Q15 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + +#else + + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A * 32768 */ + + /* convert from float to Q15 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 32768.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16)); + +#else + + /* C = A * 32768 */ + /* Convert from float to q15 and then store the results in the destination buffer */ + *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* #if defined(ARM_MATH_NEON) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of float_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c new file mode 100644 index 0000000..a222e49 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c @@ -0,0 +1,314 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_float_to_q31.c + * Description: Converts the elements of the floating-point vector to Q31 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + * @defgroup float_to_x Convert 32-bit floating point value + */ + +/** + @addtogroup float_to_x + @{ + */ + +/** + @brief Converts the elements of the floating-point vector to Q31 vector. + @param[in] pSrc points to the floating-point input vector + @param[out] pDst points to the Q31 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q31_t)(pSrc[n] * 2147483648); 0 <= n < blockSize. + </pre> + + @par Scaling and Overflow Behavior + The function uses saturating arithmetic. + Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated. + + @note + In order to apply rounding, the library should be rebuilt with the ROUNDING macro + defined in the preprocessor section of project options. + */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_float_to_q31( + const float32_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + float32_t maxQ = (float32_t) Q31_MAX; + f32x4_t vecDst; +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif + + + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time. */ + while (blkCnt > 0U) + { + + vecDst = vldrwq_f32(pSrc); + /* C = A * 2147483648 */ + /* convert from float to Q31 and then store the results in the destination buffer */ + vecDst = vmulq(vecDst, maxQ); + + vstrwq_s32(pDst, vcvtaq_s32_f32(vecDst)); + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pSrc += 4; + pDst += 4; + blkCnt --; + } + + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + /* C = A * 2147483648 */ + + /* convert from float to Q31 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pSrc++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + +#else + + /* C = A * 2147483648 */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + *pDst++ = clip_q63_to_q31((q63_t) (*pSrc++ * 2147483648.0f)); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } +} +#else +#if defined(ARM_MATH_NEON) +void arm_float_to_q31( + const float32_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + const float32_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + float32x4_t inV; + #ifdef ARM_MATH_ROUNDING + float32_t in; + float32x4_t zeroV = vdupq_n_f32(0.0f); + float32x4_t pHalf = vdupq_n_f32(0.5f / 2147483648.0f); + float32x4_t mHalf = vdupq_n_f32(-0.5f / 2147483648.0f); + float32x4_t r; + uint32x4_t cmp; + #endif + + int32x4_t outV; + + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + +#ifdef ARM_MATH_ROUNDING + + /* C = A * 32768 */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + cmp = vcgtq_f32(inV,zeroV); + r = vbslq_f32(cmp,pHalf,mHalf); + inV = vaddq_f32(inV, r); + + pIn += 4; + + outV = vcvtq_n_s32_f32(inV,31); + + vst1q_s32(pDst, outV); + pDst += 4; + +#else + + /* C = A * 2147483648 */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + + outV = vcvtq_n_s32_f32(inV,31); + + vst1q_s32(pDst, outV); + pDst += 4; + pIn += 4; + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 3; + + while (blkCnt > 0U) + { + +#ifdef ARM_MATH_ROUNDING + + /* C = A * 2147483648 */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + in = *pIn++; + in = (in * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + +#else + + /* C = A * 2147483648 */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement the loop counter */ + blkCnt--; + } + + +} +#else +void arm_float_to_q31( + const float32_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const float32_t *pIn = pSrc; /* Source pointer */ + +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif /* #ifdef ARM_MATH_ROUNDING */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A * 2147483648 */ + + /* convert from float to Q31 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + +#else + + /* C = A * 2147483648 */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A * 2147483648 */ + + /* convert from float to Q31 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 2147483648.0f); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = clip_q63_to_q31((q63_t) (in)); + +#else + + /* C = A * 2147483648 */ + /* Convert from float to Q31 and then store the results in the destination buffer */ + *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f)); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* #if defined(ARM_MATH_NEON) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of float_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c new file mode 100644 index 0000000..27af520 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c @@ -0,0 +1,330 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_float_to_q7.c + * Description: Converts the elements of the floating-point vector to Q7 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup float_to_x + @{ + */ + +/** + * @brief Converts the elements of the floating-point vector to Q7 vector. + * @param[in] *pSrc points to the floating-point input vector + * @param[out] *pDst points to the Q7 output vector + * @param[in] blockSize length of the input vector + * @return none. + * + *\par Description: + * \par + * The equation used for the conversion process is: + * <pre> + * pDst[n] = (q7_t)(pSrc[n] * 128); 0 <= n < blockSize. + * </pre> + * \par Scaling and Overflow Behavior: + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated. + * \note + * In order to apply rounding, the library should be rebuilt with the ROUNDING macro + * defined in the preprocessor section of project options. + */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_float_to_q7( + const float32_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counters */ + float32_t maxQ = powf(2.0, 7); + f32x4x4_t tmp; + q15x8_t evVec, oddVec; + q7x16_t vecDst; + float32_t const *pSrcVec; +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif + + pSrcVec = (float32_t const *) pSrc; + blkCnt = blockSize >> 4; + while (blkCnt > 0U) { + tmp = vld4q(pSrcVec); + pSrcVec += 16; + /* + * C = A * 128.0 + * convert from float to q7 and then store the results in the destination buffer + */ + tmp.val[0] = vmulq(tmp.val[0], maxQ); + tmp.val[1] = vmulq(tmp.val[1], maxQ); + tmp.val[2] = vmulq(tmp.val[2], maxQ); + tmp.val[3] = vmulq(tmp.val[3], maxQ); + + /* + * convert and pack evens + */ + evVec = vqmovnbq(evVec, vcvtaq_s32_f32(tmp.val[0])); + evVec = vqmovntq(evVec, vcvtaq_s32_f32(tmp.val[2])); + /* + * convert and pack odds + */ + oddVec = vqmovnbq(oddVec, vcvtaq_s32_f32(tmp.val[1])); + oddVec = vqmovntq(oddVec, vcvtaq_s32_f32(tmp.val[3])); + /* + * merge + */ + vecDst = vqmovnbq(vecDst, evVec); + vecDst = vqmovntq(vecDst, oddVec); + + vst1q(pDst, vecDst); + pDst += 16; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + blkCnt = blockSize & 0xF; + while (blkCnt > 0U) + { + /* C = A * 128 */ + + /* Convert from float to q7 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pSrcVec++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + +#else + + *pDst++ = (q7_t) __SSAT((q31_t) (*pSrcVec++ * 128.0f), 8); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } + +} +#else +#if defined(ARM_MATH_NEON) +void arm_float_to_q7( + const float32_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + const float32_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + float32x4_t inV; + #ifdef ARM_MATH_ROUNDING + float32_t in; + float32x4_t zeroV = vdupq_n_f32(0.0f); + float32x4_t pHalf = vdupq_n_f32(0.5f / 128.0f); + float32x4_t mHalf = vdupq_n_f32(-0.5f / 128.0f); + float32x4_t r; + uint32x4_t cmp; + #endif + + int16x4_t cvt1,cvt2; + int8x8_t outV; + + blkCnt = blockSize >> 3U; + + /* Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while (blkCnt > 0U) + { + +#ifdef ARM_MATH_ROUNDING + /* C = A * 128 */ + /* Convert from float to q7 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + cmp = vcgtq_f32(inV,zeroV); + r = vbslq_f32(cmp,pHalf,mHalf); + inV = vaddq_f32(inV, r); + cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7)); + pIn += 4; + + inV = vld1q_f32(pIn); + cmp = vcgtq_f32(inV,zeroV); + r = vbslq_f32(cmp,pHalf,mHalf); + inV = vaddq_f32(inV, r); + cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7)); + pIn += 4; + + outV = vqmovn_s16(vcombine_s16(cvt1,cvt2)); + vst1_s8(pDst, outV); + pDst += 8; + +#else + + /* C = A * 128 */ + /* Convert from float to q7 and then store the results in the destination buffer */ + inV = vld1q_f32(pIn); + cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7)); + pIn += 4; + + inV = vld1q_f32(pIn); + cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7)); + pIn += 4; + + outV = vqmovn_s16(vcombine_s16(cvt1,cvt2)); + + vst1_s8(pDst, outV); + pDst += 8; +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 7; + + while (blkCnt > 0U) + { + +#ifdef ARM_MATH_ROUNDING + /* C = A * 128 */ + /* Convert from float to q7 and then store the results in the destination buffer */ + in = *pIn++; + in = (in * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + +#else + + /* C = A * 128 */ + /* Convert from float to q7 and then store the results in the destination buffer */ + *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement the loop counter */ + blkCnt--; + } + +} +#else +void arm_float_to_q7( + const float32_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const float32_t *pIn = pSrc; /* Source pointer */ + +#ifdef ARM_MATH_ROUNDING + float32_t in; +#endif /* #ifdef ARM_MATH_ROUNDING */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = A * 128 */ + + /* Convert from float to q7 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + +#else + + *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); + *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); + *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); + *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = A * 128 */ + + /* Convert from float to q7 and store result in destination buffer */ +#ifdef ARM_MATH_ROUNDING + + in = (*pIn++ * 128); + in += in > 0.0f ? 0.5f : -0.5f; + *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8)); + +#else + + *pDst++ = (q7_t) __SSAT((q31_t) (*pIn++ * 128.0f), 8); + +#endif /* #ifdef ARM_MATH_ROUNDING */ + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* #if defined(ARM_MATH_NEON) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of float_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c new file mode 100644 index 0000000..5a46caa --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c @@ -0,0 +1,119 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_heap_sort_f32.c + * Description: Floating point heap sort + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" +#include "arm_sorting.h" + + + +static void arm_heapify(float32_t * pSrc, uint32_t n, uint32_t i, uint8_t dir) +{ + /* Put all the elements of pSrc in heap order */ + uint32_t k = i; // Initialize largest/smallest as root + uint32_t l = 2*i + 1; // left = 2*i + 1 + uint32_t r = 2*i + 2; // right = 2*i + 2 + float32_t temp; + + if (l < n && dir==(pSrc[l] > pSrc[k]) ) + k = l; + + if (r < n && dir==(pSrc[r] > pSrc[k]) ) + k = r; + + if (k != i) + { + temp = pSrc[i]; + pSrc[i]=pSrc[k]; + pSrc[k]=temp; + + arm_heapify(pSrc, n, k, dir); + } +} + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + +/** + * @private + * @param[in] S points to an instance of the sorting structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * + * @par Algorithm + * The heap sort algorithm is a comparison algorithm that + * divides the input array into a sorted and an unsorted region, + * and shrinks the unsorted region by extracting the largest + * element and moving it to the sorted region. A heap data + * structure is used to find the maximum. + * + * @par It's an in-place algorithm. In order to obtain an out-of-place + * function, a memcpy of the source vector is performed. + */ +void arm_heap_sort_f32( + const arm_sort_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t * pA; + int32_t i; + float32_t temp; + + if(pSrc != pDst) // out-of-place + { + memcpy(pDst, pSrc, blockSize*sizeof(float32_t) ); + pA = pDst; + } + else + pA = pSrc; + + // Build the heap array so that the largest value is the root + for (i = blockSize/2 - 1; i >= 0; i--) + arm_heapify(pA, blockSize, i, S->dir); + + for (i = blockSize - 1; i >= 0; i--) + { + // Swap + temp = pA[i]; + pA[i] = pA[0]; + pA[0] = temp; + + // Restore heap order + arm_heapify(pA, i, 0, S->dir); + } +} +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c new file mode 100644 index 0000000..4e85043 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c @@ -0,0 +1,93 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_insertion_sort_f32.c + * Description: Floating point insertion sort + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" +#include "arm_sorting.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + +/** + * @private + * @param[in] S points to an instance of the sorting structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * + * @par Algorithm + * The insertion sort is a simple sorting algorithm that + * reads all the element of the input array and removes one element + * at a time, finds the location it belongs in the final sorted list, + * and inserts it there. + * + * @par It's an in-place algorithm. In order to obtain an out-of-place + * function, a memcpy of the source vector is performed. + */ + +void arm_insertion_sort_f32( + const arm_sort_instance_f32 * S, + float32_t *pSrc, + float32_t* pDst, + uint32_t blockSize) +{ + float32_t * pA; + uint8_t dir = S->dir; + uint32_t i, j; + float32_t temp; + + if(pSrc != pDst) // out-of-place + { + memcpy(pDst, pSrc, blockSize*sizeof(float32_t) ); + pA = pDst; + } + else + pA = pSrc; + + // Real all the element of the input array + for(i=0; i<blockSize; i++) + { + // Move the i-th element to the right position + for (j = i; j>0 && dir==(pA[j]<pA[j-1]); j--) + { + // Swap + temp = pA[j]; + pA[j] = pA[j-1]; + pA[j-1] = temp; + } + } +} + +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c new file mode 100644 index 0000000..5c21201 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c @@ -0,0 +1,127 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_merge_sort_f32.c + * Description: Floating point merge sort + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" +#include "arm_sorting.h" + + +static void topDownMerge(float32_t * pA, uint32_t begin, uint32_t middle, uint32_t end, float32_t * pB, uint8_t dir) +{ + /* Left array is pA[begin:middle-1] + * Right Array is pA[middle:end-1] + * They are merged in pB + */ + + uint32_t i = begin; + uint32_t j = middle; + uint32_t k; + + // Read all the elements in the sublist + for (k = begin; k < end; k++) + { + // Merge + if (i < middle && (j >= end || dir==(pA[i] <= pA[j])) ) + { + pB[k] = pA[i]; + i++; + } + else + { + pB[k] = pA[j]; + j++; + } + } +} + +static void arm_merge_sort_core_f32(float32_t * pB, uint32_t begin, uint32_t end, float32_t * pA, uint8_t dir) +{ + if((int32_t)end - (int32_t)begin >= 2 ) // If run size != 1 divide + { + int32_t middle = (end + begin) / 2; // Take the middle point + + arm_merge_sort_core_f32(pA, begin, middle, pB, dir); // Sort the left part + arm_merge_sort_core_f32(pA, middle, end, pB, dir); // Sort the right part + + topDownMerge(pB, begin, middle, end, pA, dir); + } +} + + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + +/** + * @param[in] S points to an instance of the sorting structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * + * @par Algorithm + * The merge sort algorithm is a comparison algorithm that + * divide the input array in sublists and merge them to produce + * longer sorted sublists until there is only one list remaining. + * + * @par A work array is always needed. It must be allocated by the user + * linked to the instance at initialization time. + * + * @par It's an in-place algorithm. In order to obtain an out-of-place + * function, a memcpy of the source vector is performed + */ + + +void arm_merge_sort_f32( + const arm_merge_sort_instance_f32 * S, + float32_t *pSrc, + float32_t *pDst, + uint32_t blockSize) +{ + float32_t * pA; + + /* Out-of-place */ + if(pSrc != pDst) + { + memcpy(pDst, pSrc, blockSize*sizeof(float32_t)); + pA = pDst; + } + else + pA = pSrc; + + /* A working buffer is needed */ + memcpy(S->buffer, pSrc, blockSize*sizeof(float32_t)); + + arm_merge_sort_core_f32(S->buffer, 0, blockSize, pA, S->dir); +} +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c new file mode 100644 index 0000000..bd93a00 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c @@ -0,0 +1,53 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_merge_sort_init_f32.c + * Description: Floating point merge sort initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + + + /** + * @param[in,out] S points to an instance of the sorting structure. + * @param[in] dir Sorting order. + * @param[in] buffer Working buffer. + */ +void arm_merge_sort_init_f32(arm_merge_sort_instance_f32 * S, arm_sort_dir dir, float32_t * buffer) +{ + S->dir = dir; + S->buffer = buffer; +} +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c new file mode 100644 index 0000000..22a7eaa --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c @@ -0,0 +1,155 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q15_to_float.c + * Description: Converts the elements of the Q15 vector to floating-point vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + +/** + @ingroup groupSupport + */ + +/** + * @defgroup q15_to_x Convert 16-bit fixed point value + */ + +/** + @addtogroup q15_to_x + @{ + */ + +/** + @brief Converts the elements of the Q15 vector to f16 vector. + @param[in] pSrc points to the Q15 input vector + @param[out] pDst points to the f16 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (float16_t) pSrc[n] / 32768; 0 <= n < blockSize. + </pre> + */ + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +void arm_q15_to_f16( + const q15_t * pSrc, + float16_t * pDst, + uint32_t blockSize) +{ + int32_t blkCnt; /* loop counters */ + q15x8_t vecDst; + q15_t const *pSrcVec; + + pSrcVec = (q15_t const *) pSrc; + blkCnt = blockSize >> 3; + while (blkCnt > 0) + { + /* C = (float16_t) A / 32768 */ + /* convert from q15 to float and then store the results in the destination buffer */ + vecDst = vld1q(pSrcVec); pSrcVec += 8; + vstrhq(pDst, vcvtq_n_f16_s16(vecDst, 15)); pDst += 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 7; + if (blkCnt > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt); + vecDst = vld1q(pSrcVec); pSrcVec += 8; + vstrhq_p(pDst, vcvtq_n_f16_s16(vecDst, 15), p0); + } +} +#else + +void arm_q15_to_f16( + const q15_t * pSrc, + float16_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q15_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (float16_t) A / 32768 */ + + /* Convert from q15 to float and store result in destination buffer */ + *pDst++ = ((_Float16) * pIn++ / 32768.0f16); + *pDst++ = ((_Float16) * pIn++ / 32768.0f16); + *pDst++ = ((_Float16) * pIn++ / 32768.0f16); + *pDst++ = ((_Float16) * pIn++ / 32768.0f16); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (float16_t) A / 32768 */ + + /* Convert from q15 to float and store result in destination buffer */ + *pDst++ = ((_Float16) *pIn++ / 32768.0f16); + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of q15_to_x group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c new file mode 100644 index 0000000..1a20def --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c @@ -0,0 +1,207 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q15_to_float.c + * Description: Converts the elements of the Q15 vector to floating-point vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + * @defgroup q15_to_x Convert 16-bit fixed point value + */ + +/** + @addtogroup q15_to_x + @{ + */ + +/** + @brief Converts the elements of the Q15 vector to floating-point vector. + @param[in] pSrc points to the Q15 input vector + @param[out] pDst points to the floating-point output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (float32_t) pSrc[n] / 32768; 0 <= n < blockSize. + </pre> + */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q15_to_float( + const q15_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + + q15x8_t vecDst; + q15_t const *pSrcVec; + + pSrcVec = (q15_t const *) pSrc; + blkCnt = blockSize >> 2; + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + /* convert from q15 to float and then store the results in the destination buffer */ + vecDst = vldrhq_s32(pSrcVec); + pSrcVec += 4; + vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15)); + pDst += 4; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + blkCnt = blockSize & 3; + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + + /* Convert from q15 to float and store result in destination buffer */ + *pDst++ = ((float32_t) *pSrcVec++ / 32768.0f); + + /* Decrement loop counter */ + blkCnt--; + } +} +#else +#if defined(ARM_MATH_NEON_EXPERIMENTAL) +void arm_q15_to_float( + const q15_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + const q15_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + int16x8_t inV; + int32x4_t inV0, inV1; + float32x4_t outV; + + blkCnt = blockSize >> 3U; + + /* Compute 8 outputs at a time. + ** a second loop below computes the remaining 1 to 7 samples. */ + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + /* convert from q15 to float and then store the results in the destination buffer */ + inV = vld1q_s16(pIn); + pIn += 8; + + inV0 = vmovl_s16(vget_low_s16(inV)); + inV1 = vmovl_s16(vget_high_s16(inV)); + + outV = vcvtq_n_f32_s32(inV0,15); + vst1q_f32(pDst, outV); + pDst += 4; + + outV = vcvtq_n_f32_s32(inV1,15); + vst1q_f32(pDst, outV); + pDst += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 8, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 7; + + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + /* convert from q15 to float and then store the results in the destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 32768.0f); + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_q15_to_float( + const q15_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q15_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + + /* Convert from q15 to float and store result in destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 32768.0f); + *pDst++ = ((float32_t) * pIn++ / 32768.0f); + *pDst++ = ((float32_t) * pIn++ / 32768.0f); + *pDst++ = ((float32_t) * pIn++ / 32768.0f); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + + /* Convert from q15 to float and store result in destination buffer */ + *pDst++ = ((float32_t) *pIn++ / 32768.0f); + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* #if defined(ARM_MATH_NEON) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of q15_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c new file mode 100644 index 0000000..fc3c868 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c @@ -0,0 +1,182 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q15_to_q31.c + * Description: Converts the elements of the Q15 vector to Q31 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup q15_to_x + @{ + */ + +/** + @brief Converts the elements of the Q15 vector to Q31 vector. + @param[in] pSrc points to the Q15 input vector + @param[out] pDst points to the Q31 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q31_t) pSrc[n] << 16; 0 <= n < blockSize. + </pre> + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q15_to_q31( + const q15_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + + uint32_t blkCnt; + + q31x4_t vecDst; + + blkCnt = blockSize>> 2; + while (blkCnt > 0U) + { + + /* C = (q31_t)A << 16 */ + /* convert from q15 to q31 and then store the results in the destination buffer */ + /* load q15 + 32-bit widening */ + vecDst = vldrhq_s32((q15_t const *) pSrc); + vecDst = vshlq_n(vecDst, 16); + vstrwq_s32(pDst, vecDst); + + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pDst += 4; + pSrc += 4; + blkCnt --; + } + + blkCnt = blockSize & 3; + while (blkCnt > 0U) + { + /* C = (q31_t) A << 16 */ + + /* Convert from q15 to q31 and store result in destination buffer */ + *pDst++ = (q31_t) *pSrc++ << 16; + + /* Decrement loop counter */ + blkCnt--; + } +} +#else +void arm_q15_to_q31( + const q15_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q15_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) + q31_t in1, in2; + q31_t out1, out2, out3, out4; +#endif + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (q31_t)A << 16 */ + + /* Convert from q15 to q31 and store result in destination buffer */ + in1 = read_q15x2_ia (&pIn); + in2 = read_q15x2_ia (&pIn); + +#ifndef ARM_MATH_BIG_ENDIAN + + /* extract lower 16 bits to 32 bit result */ + out1 = in1 << 16U; + /* extract upper 16 bits to 32 bit result */ + out2 = in1 & 0xFFFF0000; + /* extract lower 16 bits to 32 bit result */ + out3 = in2 << 16U; + /* extract upper 16 bits to 32 bit result */ + out4 = in2 & 0xFFFF0000; + +#else + + /* extract upper 16 bits to 32 bit result */ + out1 = in1 & 0xFFFF0000; + /* extract lower 16 bits to 32 bit result */ + out2 = in1 << 16U; + /* extract upper 16 bits to 32 bit result */ + out3 = in2 & 0xFFFF0000; + /* extract lower 16 bits to 32 bit result */ + out4 = in2 << 16U; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + *pDst++ = out1; + *pDst++ = out2; + *pDst++ = out3; + *pDst++ = out4; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (q31_t) A << 16 */ + + /* Convert from q15 to q31 and store result in destination buffer */ + *pDst++ = (q31_t) *pIn++ << 16; + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of q15_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c new file mode 100644 index 0000000..eac8105 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c @@ -0,0 +1,190 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q15_to_q7.c + * Description: Converts the elements of the Q15 vector to Q7 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup q15_to_x + @{ + */ + +/** + @brief Converts the elements of the Q15 vector to Q7 vector. + @param[in] pSrc points to the Q15 input vector + @param[out] pDst points to the Q7 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q7_t) pSrc[n] >> 8; 0 <= n < blockSize. + </pre> + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q15_to_q7( + const q15_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + + uint32_t blkCnt; /* loop counters */ + q15x8x2_t tmp; + q15_t const *pSrcVec; + q7x16_t vecDst; + + + pSrcVec = (q15_t const *) pSrc; + blkCnt = blockSize >> 4; + while (blkCnt > 0U) + { + /* C = (q7_t) A >> 8 */ + /* convert from q15 to q7 and then store the results in the destination buffer */ + tmp = vld2q(pSrcVec); + pSrcVec += 16; + vecDst = vqshrnbq_n_s16(vecDst, tmp.val[0], 8); + vecDst = vqshrntq_n_s16(vecDst, tmp.val[1], 8); + vst1q(pDst, vecDst); + pDst += 16; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + blkCnt = blockSize & 0xF; + while (blkCnt > 0U) + { + /* C = (q7_t) A >> 8 */ + + /* Convert from q15 to q7 and store result in destination buffer */ + *pDst++ = (q7_t) (*pSrcVec++ >> 8); + + /* Decrement loop counter */ + blkCnt--; + } +} +#else +void arm_q15_to_q7( + const q15_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q15_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in1, in2; + q31_t out1, out2; +#endif + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (q7_t) A >> 8 */ + + /* Convert from q15 to q7 and store result in destination buffer */ +#if defined (ARM_MATH_DSP) + + in1 = read_q15x2_ia (&pIn); + in2 = read_q15x2_ia (&pIn); + +#ifndef ARM_MATH_BIG_ENDIAN + + out1 = __PKHTB(in2, in1, 16); + out2 = __PKHBT(in2, in1, 16); + +#else + + out1 = __PKHTB(in1, in2, 16); + out2 = __PKHBT(in1, in2, 16); + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* rotate packed value by 24 */ + out2 = ((uint32_t) out2 << 8) | ((uint32_t) out2 >> 24); + + /* anding with 0xff00ff00 to get two 8 bit values */ + out1 = out1 & 0xFF00FF00; + /* anding with 0x00ff00ff to get two 8 bit values */ + out2 = out2 & 0x00FF00FF; + + /* oring two values(contains two 8 bit values) to get four packed 8 bit values */ + out1 = out1 | out2; + + /* store 4 samples at a time to destiantion buffer */ + write_q7x4_ia (&pDst, out1); + +#else + + *pDst++ = (q7_t) (*pIn++ >> 8); + *pDst++ = (q7_t) (*pIn++ >> 8); + *pDst++ = (q7_t) (*pIn++ >> 8); + *pDst++ = (q7_t) (*pIn++ >> 8); + +#endif /* #if defined (ARM_MATH_DSP) */ + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (q7_t) A >> 8 */ + + /* Convert from q15 to q7 and store result in destination buffer */ + *pDst++ = (q7_t) (*pIn++ >> 8); + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of q15_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c new file mode 100644 index 0000000..ce7ce42 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c @@ -0,0 +1,202 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q31_to_float.c + * Description: Converts the elements of the Q31 vector to floating-point vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + * @defgroup q31_to_x Convert 32-bit fixed point value + */ + +/** + @addtogroup q31_to_x + @{ + */ + +/** + @brief Converts the elements of the Q31 vector to floating-point vector. + @param[in] pSrc points to the Q31 input vector + @param[out] pDst points to the floating-point output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (float32_t) pSrc[n] / 2147483648; 0 <= n < blockSize. + </pre> + */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q31_to_float( + const q31_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counters */ + q31x4_t vecDst; + q31_t const *pSrcVec; + + pSrcVec = (q31_t const *) pSrc; + blkCnt = blockSize >> 2; + while (blkCnt > 0U) + { + /* C = (float32_t) A / 2147483648 */ + /* convert from q31 to float and then store the results in the destination buffer */ + vecDst = vld1q(pSrcVec); + pSrcVec += 4; + vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 31)); + pDst += 4; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 3; + while (blkCnt > 0U) + { + /* C = (float32_t) A / 2147483648 */ + + /* Convert from q31 to float and store result in destination buffer */ + *pDst++ = ((float32_t) *pSrcVec++ / 2147483648.0f); + + /* Decrement loop counter */ + blkCnt--; + } +} + +#else +#if defined(ARM_MATH_NEON_EXPERIMENTAL) +void arm_q31_to_float( + const q31_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + const q31_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + int32x4_t inV; + float32x4_t outV; + + blkCnt = blockSize >> 2U; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) + { + /* C = (float32_t) A / 2147483648 */ + /* Convert from q31 to float and then store the results in the destination buffer */ + inV = vld1q_s32(pIn); + pIn += 4; + + outV = vcvtq_n_f32_s32(inV,31); + + vst1q_f32(pDst, outV); + pDst += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 3; + + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 2147483648 */ + /* Convert from q31 to float and then store the results in the destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 2147483648.0f); + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_q31_to_float( + const q31_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + const q31_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 2147483648 */ + + /* Convert from q31 to float and store result in destination buffer */ + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 2147483648 */ + + /* Convert from q31 to float and store result in destination buffer */ + *pDst++ = ((float32_t) *pIn++ / 2147483648.0f); + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* #if defined(ARM_MATH_NEON) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of q31_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c new file mode 100644 index 0000000..a25a2bb --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c @@ -0,0 +1,181 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q31_to_q15.c + * Description: Converts the elements of the Q31 vector to Q15 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup q31_to_x + @{ + */ + +/** + @brief Converts the elements of the Q31 vector to Q15 vector. + @param[in] pSrc points to the Q31 input vector + @param[out] pDst points to the Q15 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q15_t) pSrc[n] >> 16; 0 <= n < blockSize. + </pre> + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q31_to_q15( + const q31_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counters */ + q31x4x2_t tmp; + q15x8_t vecDst; + q31_t const *pSrcVec; + + + pSrcVec = (q31_t const *) pSrc; + blkCnt = blockSize >> 3; + while (blkCnt > 0U) + { + /* C = (q15_t) A >> 16 */ + /* convert from q31 to q15 and then store the results in the destination buffer */ + tmp = vld2q(pSrcVec); + pSrcVec += 8; + vecDst = vshrnbq_n_s32(vecDst, tmp.val[0], 16); + vecDst = vshrntq_n_s32(vecDst, tmp.val[1], 16); + vst1q(pDst, vecDst); + pDst += 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* + * tail + */ + blkCnt = blockSize & 7; + while (blkCnt > 0U) + { + /* C = (q15_t) (A >> 16) */ + + /* Convert from q31 to q15 and store result in destination buffer */ + *pDst++ = (q15_t) (*pSrcVec++ >> 16); + + /* Decrement loop counter */ + blkCnt--; + } +} + +#else +void arm_q31_to_q15( + const q31_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q31_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in1, in2, in3, in4; + q31_t out1, out2; +#endif + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (q15_t) (A >> 16) */ + + /* Convert from q31 to q15 and store result in destination buffer */ +#if defined (ARM_MATH_DSP) + + in1 = *pIn++; + in2 = *pIn++; + in3 = *pIn++; + in4 = *pIn++; + + /* pack two higher 16-bit values from two 32-bit values */ +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __PKHTB(in2, in1, 16); + out2 = __PKHTB(in4, in3, 16); +#else + out1 = __PKHTB(in1, in2, 16); + out2 = __PKHTB(in3, in4, 16); +#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ + + write_q15x2_ia (&pDst, out1); + write_q15x2_ia (&pDst, out2); + +#else + + *pDst++ = (q15_t) (*pIn++ >> 16); + *pDst++ = (q15_t) (*pIn++ >> 16); + *pDst++ = (q15_t) (*pIn++ >> 16); + *pDst++ = (q15_t) (*pIn++ >> 16); + +#endif /* #if defined (ARM_MATH_DSP) */ + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (q15_t) (A >> 16) */ + + /* Convert from q31 to q15 and store result in destination buffer */ + *pDst++ = (q15_t) (*pIn++ >> 16); + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of q31_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c new file mode 100644 index 0000000..16fbe18 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c @@ -0,0 +1,169 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q31_to_q7.c + * Description: Converts the elements of the Q31 vector to Q7 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup q31_to_x + @{ + */ + +/** + @brief Converts the elements of the Q31 vector to Q7 vector. + @param[in] pSrc points to the Q31 input vector + @param[out] pDst points to the Q7 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q7_t) pSrc[n] >> 24; 0 <= n < blockSize. + </pre> + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q31_to_q7( + const q31_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counters */ + q31x4x4_t tmp; + q15x8_t evVec, oddVec; + q7x16_t vecDst; + q31_t const *pSrcVec; + + pSrcVec = (q31_t const *) pSrc; + blkCnt = blockSize >> 4; + while (blkCnt > 0U) + { + tmp = vld4q(pSrcVec); + pSrcVec += 16; + /* C = (q7_t) A >> 24 */ + /* convert from q31 to q7 and then store the results in the destination buffer */ + /* + * narrow and pack evens + */ + evVec = vshrnbq_n_s32(evVec, tmp.val[0], 16); + evVec = vshrntq_n_s32(evVec, tmp.val[2], 16); + /* + * narrow and pack odds + */ + oddVec = vshrnbq_n_s32(oddVec, tmp.val[1], 16); + oddVec = vshrntq_n_s32(oddVec, tmp.val[3], 16); + /* + * narrow & merge + */ + vecDst = vshrnbq_n_s16(vecDst, evVec, 8); + vecDst = vshrntq_n_s16(vecDst, oddVec, 8); + + vst1q(pDst, vecDst); + pDst += 16; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + */ + blkCnt = blockSize & 0xF; + while (blkCnt > 0U) + { + /* C = (q7_t) (A >> 24) */ + + /* Convert from q31 to q7 and store result in destination buffer */ + *pDst++ = (q7_t) (*pSrcVec++ >> 24); + + /* Decrement loop counter */ + blkCnt--; + } +} +#else +void arm_q31_to_q7( + const q31_t * pSrc, + q7_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q31_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) + + q7_t out1, out2, out3, out4; + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (q7_t) (A >> 24) */ + + /* Convert from q31 to q7 and store result in destination buffer */ + + out1 = (q7_t) (*pIn++ >> 24); + out2 = (q7_t) (*pIn++ >> 24); + out3 = (q7_t) (*pIn++ >> 24); + out4 = (q7_t) (*pIn++ >> 24); + write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4)); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (q7_t) (A >> 24) */ + + /* Convert from q31 to q7 and store result in destination buffer */ + *pDst++ = (q7_t) (*pIn++ >> 24); + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of q31_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c new file mode 100644 index 0000000..9cbf8ad --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c @@ -0,0 +1,218 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q7_to_float.c + * Description: Converts the elements of the Q7 vector to floating-point vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + * @defgroup q7_to_x Convert 8-bit fixed point value + */ + +/** + @addtogroup q7_to_x + @{ + */ + +/** + @brief Converts the elements of the Q7 vector to floating-point vector. + @param[in] pSrc points to the Q7 input vector + @param[out] pDst points to the floating-point output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (float32_t) pSrc[n] / 128; 0 <= n < blockSize. + </pre> + */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q7_to_float( + const q7_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* loop counters */ + q7x16_t vecDst; + q7_t const *pSrcVec; + + pSrcVec = (q7_t const *) pSrc; + blkCnt = blockSize >> 2; + while (blkCnt > 0U) + { + /* C = (float32_t) A / 32768 */ + /* convert from q7 to float and then store the results in the destination buffer */ + vecDst = vldrbq_s32(pSrcVec); + pSrcVec += 4; + vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7)); + pDst += 4; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + blkCnt = blockSize & 3; + while (blkCnt > 0U) + { + /* C = (float32_t) A / 128 */ + + /* Convert from q7 to float and store result in destination buffer */ + *pDst++ = ((float32_t) * pSrcVec++ / 128.0f); + + /* Decrement loop counter */ + blkCnt--; + } +} +#else +#if defined(ARM_MATH_NEON) +void arm_q7_to_float( + const q7_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + const q7_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + + int8x16_t inV; + int16x8_t inVLO, inVHI; + int32x4_t inVLL, inVLH, inVHL, inVHH; + float32x4_t outV; + + blkCnt = blockSize >> 4U; + + /* Compute 16 outputs at a time. + ** a second loop below computes the remaining 1 to 15 samples. */ + while (blkCnt > 0U) + { + /* C = (float32_t) A / 128 */ + /* Convert from q7 to float and then store the results in the destination buffer */ + inV = vld1q_s8(pIn); + pIn += 16; + + inVLO = vmovl_s8(vget_low_s8(inV)); + inVHI = vmovl_s8(vget_high_s8(inV)); + + inVLL = vmovl_s16(vget_low_s16(inVLO)); + inVLH = vmovl_s16(vget_high_s16(inVLO)); + inVHL = vmovl_s16(vget_low_s16(inVHI)); + inVHH = vmovl_s16(vget_high_s16(inVHI)); + + outV = vcvtq_n_f32_s32(inVLL,7); + vst1q_f32(pDst, outV); + pDst += 4; + + outV = vcvtq_n_f32_s32(inVLH,7); + vst1q_f32(pDst, outV); + pDst += 4; + + outV = vcvtq_n_f32_s32(inVHL,7); + vst1q_f32(pDst, outV); + pDst += 4; + + outV = vcvtq_n_f32_s32(inVHH,7); + vst1q_f32(pDst, outV); + pDst += 4; + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 16, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize & 0xF; + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 128 */ + /* Convert from q7 to float and then store the results in the destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 128.0f); + + /* Decrement the loop counter */ + blkCnt--; + } +} +#else +void arm_q7_to_float( + const q7_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q7_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 128 */ + + /* Convert from q7 to float and store result in destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 128.0f); + *pDst++ = ((float32_t) * pIn++ / 128.0f); + *pDst++ = ((float32_t) * pIn++ / 128.0f); + *pDst++ = ((float32_t) * pIn++ / 128.0f); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (float32_t) A / 128 */ + + /* Convert from q7 to float and store result in destination buffer */ + *pDst++ = ((float32_t) * pIn++ / 128.0f); + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* #if defined(ARM_MATH_NEON) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of q7_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c new file mode 100644 index 0000000..be52531 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c @@ -0,0 +1,188 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q7_to_q15.c + * Description: Converts the elements of the Q7 vector to Q15 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup q7_to_x + @{ + */ + +/** + @brief Converts the elements of the Q7 vector to Q15 vector. + @param[in] pSrc points to the Q7 input vector + @param[out] pDst points to the Q15 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q15_t) pSrc[n] << 8; 0 <= n < blockSize. + </pre> + */ + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q7_to_q15( + const q7_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + + uint32_t blkCnt; /* loop counters */ + q15x8_t vecDst; + q7_t const *pSrcVec; + + + pSrcVec = (q7_t const *) pSrc; + blkCnt = blockSize >> 3; + while (blkCnt > 0U) + { + /* C = (q15_t) A << 8 */ + /* convert from q7 to q15 and then store the results in the destination buffer */ + /* load q7 + 32-bit widening */ + vecDst = vldrbq_s16(pSrcVec); + pSrcVec += 8; + vecDst = vecDst << 8; + vstrhq(pDst, vecDst); + pDst += 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + blkCnt = blockSize & 7; + while (blkCnt > 0U) + { + /* C = (q15_t) A << 8 */ + + /* Convert from q7 to q15 and store result in destination buffer */ + *pDst++ = (q15_t) * pSrcVec++ << 8; + + /* Decrement loop counter */ + blkCnt--; + } + +} +#else +void arm_q7_to_q15( + const q7_t * pSrc, + q15_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q7_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP) + q31_t in; + q31_t in1, in2; + q31_t out1, out2; +#endif + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (q15_t) A << 8 */ + + /* Convert from q7 to q15 and store result in destination buffer */ +#if defined (ARM_MATH_DSP) + + in = read_q7x4_ia (&pIn); + + /* rotatate in by 8 and extend two q7_t values to q15_t values */ + in1 = __SXTB16(__ROR(in, 8)); + + /* extend remainig two q7_t values to q15_t values */ + in2 = __SXTB16(in); + + in1 = in1 << 8U; + in2 = in2 << 8U; + + in1 = in1 & 0xFF00FF00; + in2 = in2 & 0xFF00FF00; + +#ifndef ARM_MATH_BIG_ENDIAN + out2 = __PKHTB(in1, in2, 16); + out1 = __PKHBT(in2, in1, 16); +#else + out1 = __PKHTB(in1, in2, 16); + out2 = __PKHBT(in2, in1, 16); +#endif + + write_q15x2_ia (&pDst, out1); + write_q15x2_ia (&pDst, out2); + +#else + + *pDst++ = (q15_t) *pIn++ << 8; + *pDst++ = (q15_t) *pIn++ << 8; + *pDst++ = (q15_t) *pIn++ << 8; + *pDst++ = (q15_t) *pIn++ << 8; + +#endif /* #if defined (ARM_MATH_DSP) */ + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (q15_t) A << 8 */ + + /* Convert from q7 to q15 and store result in destination buffer */ + *pDst++ = (q15_t) * pIn++ << 8; + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of q7_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c new file mode 100644 index 0000000..01d5f2b --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c @@ -0,0 +1,164 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_q7_to_q31.c + * Description: Converts the elements of the Q7 vector to Q31 vector + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/support_functions.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup q7_to_x + @{ + */ + +/** + @brief Converts the elements of the Q7 vector to Q31 vector. + @param[in] pSrc points to the Q7 input vector + @param[out] pDst points to the Q31 output vector + @param[in] blockSize number of samples in each vector + @return none + + @par Details + The equation used for the conversion process is: + <pre> + pDst[n] = (q31_t) pSrc[n] << 24; 0 <= n < blockSize. + </pre> + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +void arm_q7_to_q31( + const q7_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; + q31x4_t vecDst; + + blkCnt = blockSize >> 2; + while (blkCnt > 0U) + { + + /* C = (q31_t)A << 16 */ + /* convert from q15 to q31 and then store the results in the destination buffer */ + /* load q15 + 32-bit widening */ + vecDst = vldrbq_s32((q7_t const *) pSrc); + vecDst = vshlq_n(vecDst, 24); + vstrwq_s32(pDst, vecDst); + + /* + * Decrement the blockSize loop counter + * Advance vector source and destination pointers + */ + pDst += 4; + pSrc += 4; + blkCnt --; + } + + blkCnt = blockSize & 3; + while (blkCnt > 0U) + { + /* C = (q31_t) A << 24 */ + + /* Convert from q7 to q31 and store result in destination buffer */ + *pDst++ = (q31_t) *pSrc++ << 24; + + /* Decrement loop counter */ + blkCnt--; + } +} + +#else +void arm_q7_to_q31( + const q7_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + uint32_t blkCnt; /* Loop counter */ + const q7_t *pIn = pSrc; /* Source pointer */ + +#if defined (ARM_MATH_LOOPUNROLL) + + q31_t in; + + /* Loop unrolling: Compute 4 outputs at a time */ + blkCnt = blockSize >> 2U; + + while (blkCnt > 0U) + { + /* C = (q31_t) A << 24 */ + + /* Convert from q7 to q31 and store result in destination buffer */ + in = read_q7x4_ia (&pIn); + +#ifndef ARM_MATH_BIG_ENDIAN + + *pDst++ = (__ROR(in, 8)) & 0xFF000000; + *pDst++ = (__ROR(in, 16)) & 0xFF000000; + *pDst++ = (__ROR(in, 24)) & 0xFF000000; + *pDst++ = (in & 0xFF000000); + +#else + + *pDst++ = (in & 0xFF000000); + *pDst++ = (__ROR(in, 24)) & 0xFF000000; + *pDst++ = (__ROR(in, 16)) & 0xFF000000; + *pDst++ = (__ROR(in, 8)) & 0xFF000000; + +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Decrement loop counter */ + blkCnt--; + } + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; + +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (blkCnt > 0U) + { + /* C = (q31_t) A << 24 */ + + /* Convert from q7 to q31 and store result in destination buffer */ + *pDst++ = (q31_t) * pIn++ << 24; + + /* Decrement loop counter */ + blkCnt--; + } + +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of q7_to_x group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c new file mode 100644 index 0000000..c6c44d9 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c @@ -0,0 +1,181 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_quick_sort_f32.c + * Description: Floating point quick sort + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_sorting.h" + +static uint32_t arm_quick_sort_partition_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir) +{ + /* This function will be called */ + int32_t i, j, pivot_index; + float32_t pivot; + float32_t temp; + + /* The first element is the pivot */ + pivot_index = first; + pivot = pSrc[pivot_index]; + + /* Initialize indices for do-while loops */ + i = first - 1; + j = last + 1; + + while(i < j) + { + /* The loop will stop as soon as the indices i and j cross each other. + * + * This event will happen surely since the values of the indices are incremented and + * decrement in the do-while loops that are executed at least once. + * It is impossible to loop forever inside the do-while loops since the pivot is + * always an element of the array and the conditions cannot be always true (at least + * the i-th or the j-th element will be equal to the pivot-th element). + * For example, in the extreme case of an ordered array the do-while loop related to i will stop + * at the first iteration (because pSrc[i]=pSrc[pivot] already), and the loop related to j + * will stop after (last-first) iterations (when j=pivot=i=first). j is returned and + * j+1 is going to be used as pivot by other calls of the function, until j=pivot=last. */ + + /* Move indices to the right and to the left */ + if(dir) + { + /* Compare left elements with pivot */ + do + { + i++; + } while (pSrc[i] < pivot && i<last); + + /* Compare right elements with pivot */ + do + { + j--; + } while (pSrc[j] > pivot); + } + else + { + /* Compare left elements with pivot */ + do + { + i++; + } while (pSrc[i] > pivot && i<last); + + /* Compare right elements with pivot */ + do + { + j--; + } while (pSrc[j] < pivot); + } + + /* If the indices didn't cross each other */ + if (i < j) + { + /* i and j are in the wrong position -> Swap */ + temp=pSrc[i]; + pSrc[i]=pSrc[j]; + pSrc[j]=temp; + } + } + + return j; +} + +static void arm_quick_sort_core_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir) +{ + /* If the array [first ... last] has more than one element */ + if(first<last) + { + int32_t pivot; + + /* Compute pivot */ + pivot = arm_quick_sort_partition_f32(pSrc, first, last, dir); + + /* Iterate algorithm with two sub-arrays [first ... pivot] and [pivot+1 ... last] */ + arm_quick_sort_core_f32(pSrc, first, pivot, dir); + arm_quick_sort_core_f32(pSrc, pivot+1, last, dir); + } +} + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + +/** + * @private + * @param[in] S points to an instance of the sorting structure. + * @param[in,out] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + * + * @par Algorithm + * The quick sort algorithm is a comparison algorithm that + * divides the input array into two smaller sub-arrays and + * recursively sort them. An element of the array (the pivot) + * is chosen, all the elements with values smaller than the + * pivot are moved before the pivot, while all elements with + * values greater than the pivot are moved after it (partition). + * + * @par + * In this implementation the Hoare partition scheme has been + * used [Hoare, C. A. R. (1 January 1962). "Quicksort". The Computer + * Journal. 5 (1): 10...16.] The first element has always been chosen + * as the pivot. The partition algorithm guarantees that the returned + * pivot is never placed outside the vector, since it is returned only + * when the pointers crossed each other. In this way it isn't + * possible to obtain empty partitions and infinite recursion is avoided. + * + * @par + * It's an in-place algorithm. In order to obtain an out-of-place + * function, a memcpy of the source vector is performed. + */ + +void arm_quick_sort_f32( + const arm_sort_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + float32_t * pA; + + /* Out-of-place */ + if(pSrc != pDst) + { + memcpy(pDst, pSrc, blockSize*sizeof(float32_t) ); + pA = pDst; + } + else + pA = pSrc; + + arm_quick_sort_core_f32(pA, 0, blockSize-1, S->dir); + /* The previous function could be called recursively a maximum + * of (blockSize-1) times, generating a stack consumption of 4*(blockSize-1) bytes. */ +} + +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c new file mode 100644 index 0000000..ad534bc --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c @@ -0,0 +1,107 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_selection_sort_f32.c + * Description: Floating point selection sort + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_sorting.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + +/** + * @private + * @param[in] S points to an instance of the sorting structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data + * @param[in] blockSize number of samples to process. + * + * @par Algorithm + * The Selection sort algorithm is a comparison algorithm that + * divides the input array into a sorted and an unsorted sublist + * (initially the sorted sublist is empty and the unsorted sublist + * is the input array), looks for the smallest (or biggest) + * element in the unsorted sublist, swapping it with the leftmost + * one, and moving the sublists boundary one element to the right. + * + * @par It's an in-place algorithm. In order to obtain an out-of-place + * function, a memcpy of the source vector is performed. + */ + +void arm_selection_sort_f32( + const arm_sort_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + uint32_t i, j, k; + uint8_t dir = S->dir; + float32_t temp; + + float32_t * pA; + + if(pSrc != pDst) // out-of-place + { + memcpy(pDst, pSrc, blockSize*sizeof(float32_t) ); + pA = pDst; + } + else + pA = pSrc; + + /* Move the boundary one element to the right */ + for (i=0; i<blockSize-1; i++) + { + /* Initialize the minimum/maximum as the first element */ + k = i; + + /* Look in the unsorted list to find the minimum/maximum value */ + for (j=i+1; j<blockSize; j++) + { + if (dir==(pA[j] < pA[k]) ) + { + /* Update value */ + k = j; + } + } + + if (k != i) + { + /* Swap the minimum/maximum with the leftmost element */ + temp=pA[i]; + pA[i]=pA[k]; + pA[k]=temp; + } + } +} + +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c new file mode 100644 index 0000000..43786dc --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c @@ -0,0 +1,86 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_sort_f32.c + * Description: Floating point sort + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_sorting.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + + +/** + * @brief Generic sorting function + * + * @param[in] S points to an instance of the sorting structure. + * @param[in] pSrc points to the block of input data. + * @param[out] pDst points to the block of output data. + * @param[in] blockSize number of samples to process. + */ + +void arm_sort_f32( + const arm_sort_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst, + uint32_t blockSize) +{ + switch(S->alg) + { + case ARM_SORT_BITONIC: + arm_bitonic_sort_f32(S, pSrc, pDst, blockSize); + break; + + case ARM_SORT_BUBBLE: + arm_bubble_sort_f32(S, pSrc, pDst, blockSize); + break; + + case ARM_SORT_HEAP: + arm_heap_sort_f32(S, pSrc, pDst, blockSize); + break; + + case ARM_SORT_INSERTION: + arm_insertion_sort_f32(S, pSrc, pDst, blockSize); + break; + + case ARM_SORT_QUICK: + arm_quick_sort_f32(S, pSrc, pDst, blockSize); + break; + + case ARM_SORT_SELECTION: + arm_selection_sort_f32(S, pSrc, pDst, blockSize); + break; + } +} + +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c new file mode 100644 index 0000000..72ad9c5 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c @@ -0,0 +1,54 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_sort_init_f32.c + * Description: Floating point sort initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_sorting.h" + +/** + @ingroup groupSupport + */ + +/** + @addtogroup Sorting + @{ + */ + + + /** + * @param[in,out] S points to an instance of the sorting structure. + * @param[in] alg Selected algorithm. + * @param[in] dir Sorting order. + */ +void arm_sort_init_f32(arm_sort_instance_f32 * S, arm_sort_alg alg, arm_sort_dir dir) +{ + S->alg = alg; + S->dir = dir; +} + +/** + @} end of Sorting group + */ diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c new file mode 100644 index 0000000..f9aae6b --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c @@ -0,0 +1,146 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_weighted_sum_f16.c + * Description: Weighted Sum + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <limits.h> +#include <math.h> + +#include "dsp/support_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +/** + @ingroup groupSupport + */ + +/** + @defgroup weightedsum Weighted Sum + + Weighted sum of values + */ + + +/** + * @addtogroup weightedsum + * @{ + */ + + +/** + * @brief Weighted sum + * + * + * @param[in] *in Array of input values. + * @param[in] *weigths Weights + * @param[in] blockSize Number of samples in the input array. + * @return Weighted sum + * + */ + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" + +float16_t arm_weighted_sum_f16(const float16_t *in,const float16_t *weigths, uint32_t blockSize) +{ + _Float16 accum1, accum2; + float16x8_t accum1V, accum2V; + float16x8_t inV, wV; + const float16_t *pIn, *pW; + uint32_t blkCnt; + + + pIn = in; + pW = weigths; + + + accum1V = vdupq_n_f16(0.0f16); + accum2V = vdupq_n_f16(0.0f16); + + blkCnt = blockSize >> 3; + while (blkCnt > 0) + { + inV = vld1q(pIn); + wV = vld1q(pW); + + pIn += 4; + pW += 4; + + accum1V = vfmaq(accum1V, inV, wV); + accum2V = vaddq(accum2V, wV); + blkCnt--; + } + + accum1 = vecAddAcrossF16Mve(accum1V); + accum2 = vecAddAcrossF16Mve(accum2V); + + blkCnt = blockSize & 7; + while(blkCnt > 0) + { + accum1 += (_Float16)*pIn++ * (_Float16)*pW; + accum2 += (_Float16)*pW++; + blkCnt--; + } + + + return (accum1 / accum2); +} + +#else + +float16_t arm_weighted_sum_f16(const float16_t *in, const float16_t *weigths, uint32_t blockSize) +{ + + _Float16 accum1, accum2; + const float16_t *pIn, *pW; + uint32_t blkCnt; + + + pIn = in; + pW = weigths; + + accum1=0.0f16; + accum2=0.0f16; + + blkCnt = blockSize; + while(blkCnt > 0) + { + accum1 += (_Float16)*pIn++ * (_Float16)*pW; + accum2 += (_Float16)*pW++; + blkCnt--; + } + + return(accum1 / accum2); +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of weightedsum group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ + diff --git a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c new file mode 100644 index 0000000..dd37861 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c @@ -0,0 +1,187 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_weighted_sum_f32.c + * Description: Weighted Sum + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <limits.h> +#include <math.h> + +#include "dsp/support_functions.h" + +/** + * @addtogroup weightedsum + * @{ + */ + + +/** + * @brief Weighted sum + * + * + * @param[in] *in Array of input values. + * @param[in] *weigths Weights + * @param[in] blockSize Number of samples in the input array. + * @return Weighted sum + * + */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" + +float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize) +{ + float32_t accum1, accum2; + f32x4_t accum1V, accum2V; + f32x4_t inV, wV; + const float32_t *pIn, *pW; + uint32_t blkCnt; + + + pIn = in; + pW = weigths; + + + accum1V = vdupq_n_f32(0.0); + accum2V = vdupq_n_f32(0.0); + + blkCnt = blockSize >> 2; + while (blkCnt > 0) + { + inV = vld1q(pIn); + wV = vld1q(pW); + + pIn += 4; + pW += 4; + + accum1V = vfmaq(accum1V, inV, wV); + accum2V = vaddq(accum2V, wV); + blkCnt--; + } + + accum1 = vecAddAcrossF32Mve(accum1V); + accum2 = vecAddAcrossF32Mve(accum2V); + + blkCnt = blockSize & 3; + while(blkCnt > 0) + { + accum1 += *pIn++ * *pW; + accum2 += *pW++; + blkCnt--; + } + + + return (accum1 / accum2); +} + +#else +#if defined(ARM_MATH_NEON) + +#include "NEMath.h" +float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize) +{ + + float32_t accum1, accum2; + float32x4_t accum1V, accum2V; + float32x2_t tempV; + + float32x4_t inV,wV; + + const float32_t *pIn, *pW; + uint32_t blkCnt; + + + pIn = in; + pW = weigths; + + accum1=0.0f; + accum2=0.0f; + + accum1V = vdupq_n_f32(0.0f); + accum2V = vdupq_n_f32(0.0f); + + blkCnt = blockSize >> 2; + while(blkCnt > 0) + { + inV = vld1q_f32(pIn); + wV = vld1q_f32(pW); + + pIn += 4; + pW += 4; + + accum1V = vmlaq_f32(accum1V,inV,wV); + accum2V = vaddq_f32(accum2V,wV); + blkCnt--; + } + + tempV = vpadd_f32(vget_low_f32(accum1V),vget_high_f32(accum1V)); + accum1 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1); + + tempV = vpadd_f32(vget_low_f32(accum2V),vget_high_f32(accum2V)); + accum2 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1); + + blkCnt = blockSize & 3; + while(blkCnt > 0) + { + accum1 += *pIn++ * *pW; + accum2 += *pW++; + blkCnt--; + } + + + return(accum1 / accum2); +} +#else +float32_t arm_weighted_sum_f32(const float32_t *in, const float32_t *weigths, uint32_t blockSize) +{ + + float32_t accum1, accum2; + const float32_t *pIn, *pW; + uint32_t blkCnt; + + + pIn = in; + pW = weigths; + + accum1=0.0f; + accum2=0.0f; + + blkCnt = blockSize; + while(blkCnt > 0) + { + accum1 += *pIn++ * *pW; + accum2 += *pW++; + blkCnt--; + } + + return(accum1 / accum2); +} +#endif +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + * @} end of weightedsum group + */ |