diff options
Diffstat (limited to 'Drivers/CMSIS/DSP/Source/TransformFunctions')
61 files changed, 22989 insertions, 0 deletions
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/CMakeLists.txt b/Drivers/CMSIS/DSP/Source/TransformFunctions/CMakeLists.txt new file mode 100644 index 0000000..e598f82 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/CMakeLists.txt @@ -0,0 +1,213 @@ +cmake_minimum_required (VERSION 3.14) + +project(CMSISDSPTransform) + +include(configLib) +include(configDsp) + +add_library(CMSISDSPTransform STATIC) +configLib(CMSISDSPTransform ${ROOT}) +configDsp(CMSISDSPTransform ${ROOT}) + +include(fft) +fft(CMSISDSPTransform) + +if (CONFIGTABLE AND ALLFFT) +target_compile_definitions(CMSISDSPTransform PUBLIC ARM_ALL_FFT_TABLES) +endif() + +target_sources(CMSISDSPTransform PRIVATE arm_bitreversal.c) +target_sources(CMSISDSPTransform PRIVATE arm_bitreversal2.c) + +if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16)) +target_sources(CMSISDSPTransform PRIVATE arm_bitreversal_f16.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR CFFT_F32_16 OR CFFT_F32_32 OR CFFT_F32_64 OR CFFT_F32_128 OR CFFT_F32_256 OR CFFT_F32_512 + OR CFFT_F32_1024 OR CFFT_F32_2048 OR CFFT_F32_4096) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f32.c) +endif() + +if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16)) +if (NOT CONFIGTABLE OR ALLFFT OR CFFT_F16_16 OR CFFT_F16_32 OR CFFT_F16_64 OR CFFT_F16_128 OR CFFT_F16_256 OR CFFT_F16_512 + OR CFFT_F16_1024 OR CFFT_F16_2048 OR CFFT_F16_4096) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f16.c) +endif() +endif() + +if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16)) +if (NOT CONFIGTABLE OR ALLFFT OR RFFT_F16_128 OR RFFT_F16_512 OR RFFT_F16_2048 OR RFFT_F16_8192) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f16.c) +endif() +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR CFFT_F64_16 OR CFFT_F64_32 OR CFFT_F64_64 OR CFFT_F64_128 OR CFFT_F64_256 OR CFFT_F64_512 + OR CFFT_F64_1024 OR CFFT_F64_2048 OR CFFT_F64_4096) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_f64.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f64.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR CFFT_Q15_16 OR CFFT_Q15_32 OR CFFT_Q15_64 OR CFFT_Q15_128 OR CFFT_Q15_256 OR CFFT_Q15_512 + OR CFFT_Q15_1024 OR CFFT_Q15_2048 OR CFFT_Q15_4096) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q15.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR CFFT_Q31_16 OR CFFT_Q31_32 OR CFFT_Q31_64 OR CFFT_Q31_128 OR CFFT_Q31_256 OR CFFT_Q31_512 + OR CFFT_Q31_1024 OR CFFT_Q31_2048 OR CFFT_Q31_4096) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q31.c) +endif() + + +if (NOT CONFIGTABLE OR ALLFFT OR DCT4_F32_128 OR DCT4_F32_512 OR DCT4_F32_2048 OR DCT4_F32_8192) +target_sources(CMSISDSPTransform PRIVATE arm_dct4_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_f32.c) + +target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR DCT4_Q31_128 OR DCT4_Q31_512 OR DCT4_Q31_2048 OR DCT4_Q31_8192) +target_sources(CMSISDSPTransform PRIVATE arm_dct4_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_q31.c) + +target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR ALLFFT OR DCT4_Q15_128 OR DCT4_Q15_512 OR DCT4_Q15_2048 OR DCT4_Q15_8192) +target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_dct4_q15.c) + +target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR RFFT_FAST_F32_32 OR RFFT_FAST_F32_64 OR RFFT_FAST_F32_128 + OR RFFT_FAST_F32_256 OR RFFT_FAST_F32_512 OR RFFT_FAST_F32_1024 OR RFFT_FAST_F32_2048 + OR RFFT_FAST_F32_4096 ) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_init_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f32.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR RFFT_FAST_F64_32 OR RFFT_FAST_F64_64 OR RFFT_FAST_F64_128 + OR RFFT_FAST_F64_256 OR RFFT_FAST_F64_512 OR RFFT_FAST_F64_1024 OR RFFT_FAST_F64_2048 + OR RFFT_FAST_F64_4096 ) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_f64.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_init_f64.c) +endif() + +if ((NOT DISABLEFLOAT16)) +if (NOT CONFIGTABLE OR ALLFFT OR RFFT_FAST_F16_32 OR RFFT_FAST_F16_64 OR RFFT_FAST_F16_128 + OR RFFT_FAST_F16_256 OR RFFT_FAST_F16_512 OR RFFT_FAST_F16_1024 OR RFFT_FAST_F16_2048 + OR RFFT_FAST_F16_4096 ) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_init_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f16.c) +endif() +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR RFFT_F32_128 OR RFFT_F32_512 OR RFFT_F32_2048 OR RFFT_F32_8192) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR RFFT_Q15_32 OR RFFT_Q15_64 OR RFFT_Q15_128 OR RFFT_Q15_256 + OR RFFT_Q15_512 OR RFFT_Q15_1024 OR RFFT_Q15_2048 OR RFFT_Q15_4096 OR RFFT_Q15_8192) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR RFFT_Q31_32 OR RFFT_Q31_64 OR RFFT_Q31_128 OR RFFT_Q31_256 + OR RFFT_Q31_512 OR RFFT_Q31_1024 OR RFFT_Q31_2048 OR RFFT_Q31_4096 OR RFFT_Q31_8192) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_rfft_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c) +endif() + +if (WRAPPER OR ARM_CFFT_RADIX2_Q15) + target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_q15.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR ARM_CFFT_RADIX4_Q15) + target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q15.c) +endif() + +if (WRAPPER OR ARM_CFFT_RADIX2_Q31) + target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_q31.c) +endif() + +if (NOT CONFIGTABLE OR ALLFFT OR ARM_CFFT_RADIX4_Q31) + target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q31.c) +endif() + +# For scipy or wrappers or benchmarks +if (WRAPPER) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_f32.c) +if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16)) +target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_f16.c) +endif() + + target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_BITREV_1024) + target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096) + target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096) + target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096) +if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16)) + target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096) +endif() +endif() + +target_sources(CMSISDSPTransform PRIVATE arm_mfcc_init_f32.c) +target_sources(CMSISDSPTransform PRIVATE arm_mfcc_f32.c) + +target_sources(CMSISDSPTransform PRIVATE arm_mfcc_init_q31.c) +target_sources(CMSISDSPTransform PRIVATE arm_mfcc_q31.c) + +target_sources(CMSISDSPTransform PRIVATE arm_mfcc_init_q15.c) +target_sources(CMSISDSPTransform PRIVATE arm_mfcc_q15.c) + +if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16)) +target_sources(CMSISDSPTransform PRIVATE arm_mfcc_init_f16.c) +target_sources(CMSISDSPTransform PRIVATE arm_mfcc_f16.c) +endif() + +### Includes +target_include_directories(CMSISDSPTransform PUBLIC "${DSP}/Include") + + + diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c new file mode 100644 index 0000000..f327801 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: TransformFunctions.c + * Description: Combination of all transform function source files. + * + * $Date: 18. March 2019 + * $Revision: V1.0.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_bitreversal.c" +#include "arm_bitreversal2.c" +#include "arm_cfft_f32.c" +#include "arm_cfft_f64.c" +#include "arm_cfft_q15.c" +#include "arm_cfft_q31.c" +#include "arm_cfft_init_f32.c" +#include "arm_cfft_init_f64.c" +#include "arm_cfft_init_q15.c" +#include "arm_cfft_init_q31.c" +#include "arm_cfft_radix2_f32.c" +#include "arm_cfft_radix2_q15.c" +#include "arm_cfft_radix2_q31.c" +#include "arm_cfft_radix4_f32.c" +#include "arm_cfft_radix4_q15.c" +#include "arm_cfft_radix4_q31.c" +#include "arm_cfft_radix8_f32.c" +#include "arm_rfft_fast_f32.c" +#include "arm_rfft_fast_f64.c" +#include "arm_rfft_fast_init_f32.c" +#include "arm_rfft_fast_init_f64.c" + +#include "arm_mfcc_init_f32.c" +#include "arm_mfcc_f32.c" + +#include "arm_mfcc_init_q31.c" +#include "arm_mfcc_q31.c" + +#include "arm_mfcc_init_q15.c" +#include "arm_mfcc_q15.c" + +/* Deprecated */ + +#include "arm_dct4_f32.c" +#include "arm_dct4_init_f32.c" +#include "arm_dct4_init_q15.c" +#include "arm_dct4_init_q31.c" +#include "arm_dct4_q15.c" +#include "arm_dct4_q31.c" + +#include "arm_rfft_f32.c" +#include "arm_rfft_q15.c" +#include "arm_rfft_q31.c" + +#include "arm_rfft_init_f32.c" +#include "arm_rfft_init_q15.c" +#include "arm_rfft_init_q31.c" + +#include "arm_cfft_radix4_init_f32.c" +#include "arm_cfft_radix4_init_q15.c" +#include "arm_cfft_radix4_init_q31.c" + +#include "arm_cfft_radix2_init_f32.c" +#include "arm_cfft_radix2_init_q15.c" +#include "arm_cfft_radix2_init_q31.c" diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctionsF16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctionsF16.c new file mode 100644 index 0000000..b082d44 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctionsF16.c @@ -0,0 +1,44 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: TransformFunctionsF16.c + * Description: Combination of all transform function f16 source files. + * + * $Date: 20. April 2020 + * $Revision: V1.0.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_cfft_f16.c" +#include "arm_cfft_init_f16.c" +#include "arm_cfft_radix2_f16.c" +#include "arm_cfft_radix4_f16.c" +#include "arm_rfft_fast_init_f16.c" +#include "arm_rfft_fast_f16.c" +#include "arm_cfft_radix8_f16.c" + +#include "arm_bitreversal_f16.c" + +#include "arm_mfcc_init_f16.c" +#include "arm_mfcc_f16.c" + +/* Deprecated */ +#include "arm_cfft_radix2_init_f16.c" +#include "arm_cfft_radix4_init_f16.c" diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c new file mode 100644 index 0000000..687a9e8 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c @@ -0,0 +1,230 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_bitreversal.c + * Description: Bitreversal functions + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + + +/** + @brief In-place floating-point bit reversal function. + @param[in,out] pSrc points to in-place floating-point data buffer + @param[in] fftSize length of FFT + @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table + @param[in] pBitRevTab points to bit reversal table + @return none + */ + +void arm_bitreversal_f32( + float32_t * pSrc, + uint16_t fftSize, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab) +{ + uint16_t fftLenBy2, fftLenBy2p1; + uint16_t i, j; + float32_t in; + + /* Initializations */ + j = 0U; + fftLenBy2 = fftSize >> 1U; + fftLenBy2p1 = (fftSize >> 1U) + 1U; + + /* Bit Reversal Implementation */ + for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U) + { + if (i < j) + { + /* pSrc[i] <-> pSrc[j]; */ + in = pSrc[2U * i]; + pSrc[2U * i] = pSrc[2U * j]; + pSrc[2U * j] = in; + + /* pSrc[i+1U] <-> pSrc[j+1U] */ + in = pSrc[(2U * i) + 1U]; + pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U]; + pSrc[(2U * j) + 1U] = in; + + /* pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */ + in = pSrc[2U * (i + fftLenBy2p1)]; + pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)]; + pSrc[2U * (j + fftLenBy2p1)] = in; + + /* pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */ + in = pSrc[(2U * (i + fftLenBy2p1)) + 1U]; + pSrc[(2U * (i + fftLenBy2p1)) + 1U] = + pSrc[(2U * (j + fftLenBy2p1)) + 1U]; + pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in; + + } + + /* pSrc[i+1U] <-> pSrc[j+1U] */ + in = pSrc[2U * (i + 1U)]; + pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)]; + pSrc[2U * (j + fftLenBy2)] = in; + + /* pSrc[i+2U] <-> pSrc[j+2U] */ + in = pSrc[(2U * (i + 1U)) + 1U]; + pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U]; + pSrc[(2U * (j + fftLenBy2)) + 1U] = in; + + /* Reading the index for the bit reversal */ + j = *pBitRevTab; + + /* Updating the bit reversal index depending on the fft length */ + pBitRevTab += bitRevFactor; + } +} + + +/** + @brief In-place Q31 bit reversal function. + @param[in,out] pSrc points to in-place Q31 data buffer. + @param[in] fftLen length of FFT. + @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table + @param[in] pBitRevTab points to bit reversal table + @return none +*/ + +void arm_bitreversal_q31( + q31_t * pSrc, + uint32_t fftLen, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab) +{ + uint32_t fftLenBy2, fftLenBy2p1, i, j; + q31_t in; + + /* Initializations */ + j = 0U; + fftLenBy2 = fftLen / 2U; + fftLenBy2p1 = (fftLen / 2U) + 1U; + + /* Bit Reversal Implementation */ + for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U) + { + if (i < j) + { + /* pSrc[i] <-> pSrc[j]; */ + in = pSrc[2U * i]; + pSrc[2U * i] = pSrc[2U * j]; + pSrc[2U * j] = in; + + /* pSrc[i+1U] <-> pSrc[j+1U] */ + in = pSrc[(2U * i) + 1U]; + pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U]; + pSrc[(2U * j) + 1U] = in; + + /* pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */ + in = pSrc[2U * (i + fftLenBy2p1)]; + pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)]; + pSrc[2U * (j + fftLenBy2p1)] = in; + + /* pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */ + in = pSrc[(2U * (i + fftLenBy2p1)) + 1U]; + pSrc[(2U * (i + fftLenBy2p1)) + 1U] = + pSrc[(2U * (j + fftLenBy2p1)) + 1U]; + pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in; + + } + + /* pSrc[i+1U] <-> pSrc[j+1U] */ + in = pSrc[2U * (i + 1U)]; + pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)]; + pSrc[2U * (j + fftLenBy2)] = in; + + /* pSrc[i+2U] <-> pSrc[j+2U] */ + in = pSrc[(2U * (i + 1U)) + 1U]; + pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U]; + pSrc[(2U * (j + fftLenBy2)) + 1U] = in; + + /* Reading the index for the bit reversal */ + j = *pBitRevTab; + + /* Updating the bit reversal index depending on the fft length */ + pBitRevTab += bitRevFactor; + } +} + + + +/** + @brief In-place Q15 bit reversal function. + @param[in,out] pSrc16 points to in-place Q15 data buffer + @param[in] fftLen length of FFT + @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table + @param[in] pBitRevTab points to bit reversal table + @return none +*/ + +void arm_bitreversal_q15( + q15_t * pSrc16, + uint32_t fftLen, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab) +{ + q31_t *pSrc = (q31_t *) pSrc16; + q31_t in; + uint32_t fftLenBy2, fftLenBy2p1; + uint32_t i, j; + + /* Initializations */ + j = 0U; + fftLenBy2 = fftLen / 2U; + fftLenBy2p1 = (fftLen / 2U) + 1U; + + /* Bit Reversal Implementation */ + for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U) + { + if (i < j) + { + /* pSrc[i] <-> pSrc[j]; */ + /* pSrc[i+1U] <-> pSrc[j+1U] */ + in = pSrc[i]; + pSrc[i] = pSrc[j]; + pSrc[j] = in; + + /* pSrc[i + fftLenBy2p1] <-> pSrc[j + fftLenBy2p1]; */ + /* pSrc[i + fftLenBy2p1+1U] <-> pSrc[j + fftLenBy2p1+1U] */ + in = pSrc[i + fftLenBy2p1]; + pSrc[i + fftLenBy2p1] = pSrc[j + fftLenBy2p1]; + pSrc[j + fftLenBy2p1] = in; + } + + /* pSrc[i+1U] <-> pSrc[j+fftLenBy2]; */ + /* pSrc[i+2] <-> pSrc[j+fftLenBy2+1U] */ + in = pSrc[i + 1U]; + pSrc[i + 1U] = pSrc[j + fftLenBy2]; + pSrc[j + fftLenBy2] = in; + + /* Reading the index for the bit reversal */ + j = *pBitRevTab; + + /* Updating the bit reversal index depending on the fft length */ + pBitRevTab += bitRevFactor; + } +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S new file mode 100644 index 0000000..c16091b --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S @@ -0,0 +1,216 @@ +;/* ---------------------------------------------------------------------- +; * Project: CMSIS DSP Library +; * Title: arm_bitreversal2.S +; * Description: arm_bitreversal_32 function done in assembly for maximum speed. +; * Called after doing an fft to reorder the output. +; * The function is loop unrolled by 2. arm_bitreversal_16 as well. +; * +; * $Date: 18. March 2019 +; * $Revision: V1.5.2 +; * +; * Target Processor: Cortex-M cores +; * -------------------------------------------------------------------- */ +;/* +; * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. +; * +; * SPDX-License-Identifier: Apache-2.0 +; * +; * Licensed under the Apache License, Version 2.0 (the License); you may +; * not use this file except in compliance with the License. +; * You may obtain a copy of the License at +; * +; * www.apache.org/licenses/LICENSE-2.0 +; * +; * Unless required by applicable law or agreed to in writing, software +; * distributed under the License is distributed on an AS IS BASIS, WITHOUT +; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; * See the License for the specific language governing permissions and +; * limitations under the License. +; */ + +#if defined ( __CC_ARM ) /* Keil */ + #define CODESECT AREA ||.text||, CODE, READONLY, ALIGN=2 + #define LABEL +#elif defined ( __IASMARM__ ) /* IAR */ + #define CODESECT SECTION `.text`:CODE + #define PROC + #define LABEL + #define ENDP + #define EXPORT PUBLIC +#elif defined ( __CSMC__ ) /* Cosmic */ + #define CODESECT switch .text + #define THUMB + #define EXPORT xdef + #define PROC : + #define LABEL : + #define ENDP + #define arm_bitreversal_32 _arm_bitreversal_32 +#elif defined ( __TI_ARM__ ) /* TI ARM */ + #define THUMB .thumb + #define CODESECT .text + #define EXPORT .global + #define PROC : .asmfunc + #define LABEL : + #define ENDP .endasmfunc + #define END +#elif defined ( __GNUC__ ) /* GCC */ + #define THUMB .thumb + #define CODESECT .section .text + #define EXPORT .global + #define PROC : + #define LABEL : + #define ENDP + #define END + + .syntax unified +#endif + + CODESECT + THUMB + +;/** +; @brief In-place bit reversal function. +; @param[in,out] pSrc points to the in-place buffer of unknown 32-bit data type +; @param[in] bitRevLen bit reversal table length +; @param[in] pBitRevTab points to bit reversal table +; @return none +; */ + EXPORT arm_bitreversal_32 + EXPORT arm_bitreversal_16 + +#if defined ( __CC_ARM ) /* Keil */ +#elif defined ( __IASMARM__ ) /* IAR */ +#elif defined ( __CSMC__ ) /* Cosmic */ +#elif defined ( __TI_ARM__ ) /* TI ARM */ +#elif defined ( __GNUC__ ) /* GCC */ + .type arm_bitreversal_16, %function + .type arm_bitreversal_32, %function +#endif + +#if defined (ARM_MATH_CM0_FAMILY) + +arm_bitreversal_32 PROC + ADDS r3,r1,#1 + PUSH {r4-r6} + ADDS r1,r2,#0 + LSRS r3,r3,#1 +arm_bitreversal_32_0 LABEL + LDRH r2,[r1,#2] + LDRH r6,[r1,#0] + ADD r2,r0,r2 + ADD r6,r0,r6 + LDR r5,[r2,#0] + LDR r4,[r6,#0] + STR r5,[r6,#0] + STR r4,[r2,#0] + LDR r5,[r2,#4] + LDR r4,[r6,#4] + STR r5,[r6,#4] + STR r4,[r2,#4] + ADDS r1,r1,#4 + SUBS r3,r3,#1 + BNE arm_bitreversal_32_0 + POP {r4-r6} + BX lr + ENDP + +arm_bitreversal_16 PROC + ADDS r3,r1,#1 + PUSH {r4-r6} + ADDS r1,r2,#0 + LSRS r3,r3,#1 +arm_bitreversal_16_0 LABEL + LDRH r2,[r1,#2] + LDRH r6,[r1,#0] + LSRS r2,r2,#1 + LSRS r6,r6,#1 + ADD r2,r0,r2 + ADD r6,r0,r6 + LDR r5,[r2,#0] + LDR r4,[r6,#0] + STR r5,[r6,#0] + STR r4,[r2,#0] + ADDS r1,r1,#4 + SUBS r3,r3,#1 + BNE arm_bitreversal_16_0 + POP {r4-r6} + BX lr + ENDP + +#else + +arm_bitreversal_32 PROC + ADDS r3,r1,#1 + CMP r3,#1 + IT LS + BXLS lr + PUSH {r4-r9} + ADDS r1,r2,#2 + LSRS r3,r3,#2 +arm_bitreversal_32_0 LABEL ;/* loop unrolled by 2 */ + LDRH r8,[r1,#4] + LDRH r9,[r1,#2] + LDRH r2,[r1,#0] + LDRH r12,[r1,#-2] + ADD r8,r0,r8 + ADD r9,r0,r9 + ADD r2,r0,r2 + ADD r12,r0,r12 + LDR r7,[r9,#0] + LDR r6,[r8,#0] + LDR r5,[r2,#0] + LDR r4,[r12,#0] + STR r6,[r9,#0] + STR r7,[r8,#0] + STR r5,[r12,#0] + STR r4,[r2,#0] + LDR r7,[r9,#4] + LDR r6,[r8,#4] + LDR r5,[r2,#4] + LDR r4,[r12,#4] + STR r6,[r9,#4] + STR r7,[r8,#4] + STR r5,[r12,#4] + STR r4,[r2,#4] + ADDS r1,r1,#8 + SUBS r3,r3,#1 + BNE arm_bitreversal_32_0 + POP {r4-r9} + BX lr + ENDP + +arm_bitreversal_16 PROC + ADDS r3,r1,#1 + CMP r3,#1 + IT LS + BXLS lr + PUSH {r4-r9} + ADDS r1,r2,#2 + LSRS r3,r3,#2 +arm_bitreversal_16_0 LABEL ;/* loop unrolled by 2 */ + LDRH r8,[r1,#4] + LDRH r9,[r1,#2] + LDRH r2,[r1,#0] + LDRH r12,[r1,#-2] + ADD r8,r0,r8,LSR #1 + ADD r9,r0,r9,LSR #1 + ADD r2,r0,r2,LSR #1 + ADD r12,r0,r12,LSR #1 + LDR r7,[r9,#0] + LDR r6,[r8,#0] + LDR r5,[r2,#0] + LDR r4,[r12,#0] + STR r6,[r9,#0] + STR r7,[r8,#0] + STR r5,[r12,#0] + STR r4,[r2,#0] + ADDS r1,r1,#8 + SUBS r3,r3,#1 + BNE arm_bitreversal_16_0 + POP {r4-r9} + BX lr + ENDP + +#endif + + END diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c new file mode 100644 index 0000000..77fac1f --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c @@ -0,0 +1,134 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_bitreversal2.c + * Description: Bitreversal functions + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + + +/** + @brief In-place 64 bit reversal function. + @param[in,out] pSrc points to in-place buffer of unknown 64-bit data type + @param[in] bitRevLen bit reversal table length + @param[in] pBitRevTab points to bit reversal table + @return none +*/ + +void arm_bitreversal_64( + uint64_t *pSrc, + const uint16_t bitRevLen, + const uint16_t *pBitRevTab) +{ + uint64_t a, b, i, tmp; + + for (i = 0; i < bitRevLen; ) + { + a = pBitRevTab[i ] >> 2; + b = pBitRevTab[i + 1] >> 2; + + //real + tmp = pSrc[a]; + pSrc[a] = pSrc[b]; + pSrc[b] = tmp; + + //complex + tmp = pSrc[a+1]; + pSrc[a+1] = pSrc[b+1]; + pSrc[b+1] = tmp; + + i += 2; + } +} + +/** + @brief In-place 32 bit reversal function. + @param[in,out] pSrc points to in-place buffer of unknown 32-bit data type + @param[in] bitRevLen bit reversal table length + @param[in] pBitRevTab points to bit reversal table + @return none +*/ + +void arm_bitreversal_32( + uint32_t *pSrc, + const uint16_t bitRevLen, + const uint16_t *pBitRevTab) +{ + uint32_t a, b, i, tmp; + + for (i = 0; i < bitRevLen; ) + { + a = pBitRevTab[i ] >> 2; + b = pBitRevTab[i + 1] >> 2; + + //real + tmp = pSrc[a]; + pSrc[a] = pSrc[b]; + pSrc[b] = tmp; + + //complex + tmp = pSrc[a+1]; + pSrc[a+1] = pSrc[b+1]; + pSrc[b+1] = tmp; + + i += 2; + } +} + + +/** + @brief In-place 16 bit reversal function. + @param[in,out] pSrc points to in-place buffer of unknown 16-bit data type + @param[in] bitRevLen bit reversal table length + @param[in] pBitRevTab points to bit reversal table + @return none +*/ + +void arm_bitreversal_16( + uint16_t *pSrc, + const uint16_t bitRevLen, + const uint16_t *pBitRevTab) +{ + uint16_t a, b, i, tmp; + + for (i = 0; i < bitRevLen; ) + { + a = pBitRevTab[i ] >> 2; + b = pBitRevTab[i + 1] >> 2; + + //real + tmp = pSrc[a]; + pSrc[a] = pSrc[b]; + pSrc[b] = tmp; + + //complex + tmp = pSrc[a+1]; + pSrc[a+1] = pSrc[b+1]; + pSrc[b+1] = tmp; + + i += 2; + } +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c new file mode 100644 index 0000000..2e64636 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c @@ -0,0 +1,102 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_bitreversal_f16.c + * Description: Bitreversal functions + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" + +/* +* @brief In-place bit reversal function. +* @param[in, out] *pSrc points to the in-place buffer of floating-point data type. +* @param[in] fftSize length of the FFT. +* @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table. +* @param[in] *pBitRevTab points to the bit reversal table. +* @return none. +*/ + +#if defined(ARM_FLOAT16_SUPPORTED) + +void arm_bitreversal_f16( +float16_t * pSrc, +uint16_t fftSize, +uint16_t bitRevFactor, +const uint16_t * pBitRevTab) +{ + uint16_t fftLenBy2, fftLenBy2p1; + uint16_t i, j; + float16_t in; + + /* Initializations */ + j = 0U; + fftLenBy2 = fftSize >> 1U; + fftLenBy2p1 = (fftSize >> 1U) + 1U; + + /* Bit Reversal Implementation */ + for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U) + { + if (i < j) + { + /* pSrc[i] <-> pSrc[j]; */ + in = pSrc[2U * i]; + pSrc[2U * i] = pSrc[2U * j]; + pSrc[2U * j] = in; + + /* pSrc[i+1U] <-> pSrc[j+1U] */ + in = pSrc[(2U * i) + 1U]; + pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U]; + pSrc[(2U * j) + 1U] = in; + + /* pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */ + in = pSrc[2U * (i + fftLenBy2p1)]; + pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)]; + pSrc[2U * (j + fftLenBy2p1)] = in; + + /* pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */ + in = pSrc[(2U * (i + fftLenBy2p1)) + 1U]; + pSrc[(2U * (i + fftLenBy2p1)) + 1U] = + pSrc[(2U * (j + fftLenBy2p1)) + 1U]; + pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in; + + } + + /* pSrc[i+1U] <-> pSrc[j+1U] */ + in = pSrc[2U * (i + 1U)]; + pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)]; + pSrc[2U * (j + fftLenBy2)] = in; + + /* pSrc[i+2U] <-> pSrc[j+2U] */ + in = pSrc[(2U * (i + 1U)) + 1U]; + pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U]; + pSrc[(2U * (j + fftLenBy2)) + 1U] = in; + + /* Reading the index for the bit reversal */ + j = *pBitRevTab; + + /* Updating the bit reversal index depending on the fft length */ + pBitRevTab += bitRevFactor; + } +} +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c new file mode 100644 index 0000000..edfc0b4 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c @@ -0,0 +1,842 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_f32.c + * Description: Combined Radix Decimation in Frequency CFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" +#include "arm_common_tables_f16.h" + + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_fft.h" +#include "arm_mve_tables_f16.h" + + +static float16_t arm_inverse_fft_length_f16(uint16_t fftLen) +{ + float16_t retValue=1.0; + + switch (fftLen) + { + + case 4096U: + retValue = (float16_t)0.000244140625f; + break; + + case 2048U: + retValue = (float16_t)0.00048828125f; + break; + + case 1024U: + retValue = (float16_t)0.0009765625f; + break; + + case 512U: + retValue = (float16_t)0.001953125f; + break; + + case 256U: + retValue = (float16_t)0.00390625f; + break; + + case 128U: + retValue = (float16_t)0.0078125f; + break; + + case 64U: + retValue = (float16_t)0.015625f; + break; + + case 32U: + retValue = (float16_t)0.03125f; + break; + + case 16U: + retValue = (float16_t)0.0625f; + break; + + + default: + break; + } + return(retValue); +} + + +static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen) +{ + f16x8_t vecTmp0, vecTmp1; + f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1; + f16x8_t vecA, vecB, vecC, vecD; + uint32_t blkCnt; + uint32_t n1, n2; + uint32_t stage = 0; + int32_t iter = 1; + static const int32_t strides[4] = + { ( 0 - 16) * (int32_t)sizeof(float16_t *) + , ( 4 - 16) * (int32_t)sizeof(float16_t *) + , ( 8 - 16) * (int32_t)sizeof(float16_t *) + , (12 - 16) * (int32_t)sizeof(float16_t *)}; + + n2 = fftLen; + n1 = n2; + n2 >>= 2u; + for (int k = fftLen / 4u; k > 1; k >>= 2) + { + float16_t const *p_rearranged_twiddle_tab_stride1 = + &S->rearranged_twiddle_stride1[ + S->rearranged_twiddle_tab_stride1_arr[stage]]; + float16_t const *p_rearranged_twiddle_tab_stride2 = + &S->rearranged_twiddle_stride2[ + S->rearranged_twiddle_tab_stride2_arr[stage]]; + float16_t const *p_rearranged_twiddle_tab_stride3 = + &S->rearranged_twiddle_stride3[ + S->rearranged_twiddle_tab_stride3_arr[stage]]; + float16_t * pBase = pSrc; + for (int i = 0; i < iter; i++) + { + float16_t *inA = pBase; + float16_t *inB = inA + n2 * CMPLX_DIM; + float16_t *inC = inB + n2 * CMPLX_DIM; + float16_t *inD = inC + n2 * CMPLX_DIM; + float16_t const *pW1 = p_rearranged_twiddle_tab_stride1; + float16_t const *pW2 = p_rearranged_twiddle_tab_stride2; + float16_t const *pW3 = p_rearranged_twiddle_tab_stride3; + f16x8_t vecW; + + blkCnt = n2 / 4; + /* + * load 2 f16 complex pair + */ + vecA = vldrhq_f16(inA); + vecC = vldrhq_f16(inC); + while (blkCnt > 0U) + { + vecB = vldrhq_f16(inB); + vecD = vldrhq_f16(inD); + + vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ + vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ + + vecSum1 = vecB + vecD; + vecDiff1 = vecB - vecD; + /* + * [ 1 1 1 1 ] * [ A B C D ]' .* 1 + */ + vecTmp0 = vecSum0 + vecSum1; + vst1q(inA, vecTmp0); + inA += 8; + + /* + * [ 1 -1 1 -1 ] * [ A B C D ]' + */ + vecTmp0 = vecSum0 - vecSum1; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2 + */ + vecW = vld1q(pW2); + pW2 += 8; + vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); + vst1q(inB, vecTmp1); + inB += 8; + + /* + * [ 1 -i -1 +i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 -i -1 +i ] * [ A B C D ]'.* W1 + */ + vecW = vld1q(pW1); + pW1 +=8; + vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); + vst1q(inC, vecTmp1); + inC += 8; + + /* + * [ 1 +i -1 -i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 +i -1 -i ] * [ A B C D ]'.* W3 + */ + vecW = vld1q(pW3); + pW3 += 8; + vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); + vst1q(inD, vecTmp1); + inD += 8; + + vecA = vldrhq_f16(inA); + vecC = vldrhq_f16(inC); + + blkCnt--; + } + pBase += CMPLX_DIM * n1; + } + n1 = n2; + n2 >>= 2u; + iter = iter << 2; + stage++; + } + + /* + * start of Last stage process + */ + uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides); + vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; + + /* load scheduling */ + vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8); + + blkCnt = (fftLen >> 4); + while (blkCnt > 0U) + { + vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ + vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ + + vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4); + vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12); + + vecSum1 = vecB + vecD; + vecDiff1 = vecB - vecD; + + /* pre-load for next iteration */ + vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8); + + vecTmp0 = vecSum0 + vecSum1; + vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0); + + vecTmp0 = vecSum0 - vecSum1; + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0); + + vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0); + + vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0); + + blkCnt--; + } + + /* + * End of last stage process + */ +} + +static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen) +{ + float16_t const *pCoefVec; + float16_t const *pCoef = S->pTwiddle; + float16_t *pIn0, *pIn1; + uint32_t n2; + uint32_t blkCnt; + f16x8_t vecIn0, vecIn1, vecSum, vecDiff; + f16x8_t vecCmplxTmp, vecTw; + + + n2 = fftLen >> 1; + pIn0 = pSrc; + pIn1 = pSrc + fftLen; + pCoefVec = pCoef; + + blkCnt = n2 / 4; + while (blkCnt > 0U) + { + vecIn0 = *(f16x8_t *) pIn0; + vecIn1 = *(f16x8_t *) pIn1; + vecTw = vld1q(pCoefVec); + pCoefVec += 8; + + vecSum = vaddq(vecIn0, vecIn1); + vecDiff = vsubq(vecIn0, vecIn1); + + vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff); + + vst1q(pIn0, vecSum); + pIn0 += 8; + vst1q(pIn1, vecCmplxTmp); + pIn1 += 8; + + blkCnt--; + } + + _arm_radix4_butterfly_f16_mve(S, pSrc, n2); + + _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2); + + pIn0 = pSrc; +} + +static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen) +{ + f16x8_t vecTmp0, vecTmp1; + f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1; + f16x8_t vecA, vecB, vecC, vecD; + uint32_t blkCnt; + uint32_t n1, n2; + uint32_t stage = 0; + int32_t iter = 1; + static const int32_t strides[4] = { + ( 0 - 16) * (int32_t)sizeof(q31_t *), + ( 4 - 16) * (int32_t)sizeof(q31_t *), + ( 8 - 16) * (int32_t)sizeof(q31_t *), + (12 - 16) * (int32_t)sizeof(q31_t *) + }; + + n2 = fftLen; + n1 = n2; + n2 >>= 2u; + for (int k = fftLen / 4; k > 1; k >>= 2) + { + float16_t const *p_rearranged_twiddle_tab_stride1 = + &S->rearranged_twiddle_stride1[ + S->rearranged_twiddle_tab_stride1_arr[stage]]; + float16_t const *p_rearranged_twiddle_tab_stride2 = + &S->rearranged_twiddle_stride2[ + S->rearranged_twiddle_tab_stride2_arr[stage]]; + float16_t const *p_rearranged_twiddle_tab_stride3 = + &S->rearranged_twiddle_stride3[ + S->rearranged_twiddle_tab_stride3_arr[stage]]; + + float16_t * pBase = pSrc; + for (int i = 0; i < iter; i++) + { + float16_t *inA = pBase; + float16_t *inB = inA + n2 * CMPLX_DIM; + float16_t *inC = inB + n2 * CMPLX_DIM; + float16_t *inD = inC + n2 * CMPLX_DIM; + float16_t const *pW1 = p_rearranged_twiddle_tab_stride1; + float16_t const *pW2 = p_rearranged_twiddle_tab_stride2; + float16_t const *pW3 = p_rearranged_twiddle_tab_stride3; + f16x8_t vecW; + + blkCnt = n2 / 4; + /* + * load 2 f32 complex pair + */ + vecA = vldrhq_f16(inA); + vecC = vldrhq_f16(inC); + while (blkCnt > 0U) + { + vecB = vldrhq_f16(inB); + vecD = vldrhq_f16(inD); + + vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ + vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ + + vecSum1 = vecB + vecD; + vecDiff1 = vecB - vecD; + /* + * [ 1 1 1 1 ] * [ A B C D ]' .* 1 + */ + vecTmp0 = vecSum0 + vecSum1; + vst1q(inA, vecTmp0); + inA += 8; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]' + */ + vecTmp0 = vecSum0 - vecSum1; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1 + */ + vecW = vld1q(pW2); + pW2 += 8; + vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); + vst1q(inB, vecTmp1); + inB += 8; + + /* + * [ 1 -i -1 +i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 -i -1 +i ] * [ A B C D ]'.* W2 + */ + vecW = vld1q(pW1); + pW1 += 8; + vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); + vst1q(inC, vecTmp1); + inC += 8; + + /* + * [ 1 +i -1 -i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 +i -1 -i ] * [ A B C D ]'.* W3 + */ + vecW = vld1q(pW3); + pW3 += 8; + vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); + vst1q(inD, vecTmp1); + inD += 8; + + vecA = vldrhq_f16(inA); + vecC = vldrhq_f16(inC); + + blkCnt--; + } + pBase += CMPLX_DIM * n1; + } + n1 = n2; + n2 >>= 2u; + iter = iter << 2; + stage++; + } + + /* + * start of Last stage process + */ + uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides); + vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; + + /* + * load scheduling + */ + vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8); + + blkCnt = (fftLen >> 4); + while (blkCnt > 0U) + { + vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ + vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ + + vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4); + vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12); + + vecSum1 = vecB + vecD; + vecDiff1 = vecB - vecD; + + vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8); + + vecTmp0 = vecSum0 + vecSum1; + vecTmp0 = vecTmp0 * onebyfftLen; + vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0); + + vecTmp0 = vecSum0 - vecSum1; + vecTmp0 = vecTmp0 * onebyfftLen; + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0); + + vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); + vecTmp0 = vecTmp0 * onebyfftLen; + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0); + + vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); + vecTmp0 = vecTmp0 * onebyfftLen; + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0); + + blkCnt--; + } + + /* + * End of last stage process + */ +} + +static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t *pSrc, uint32_t fftLen) +{ + float16_t const *pCoefVec; + float16_t const *pCoef = S->pTwiddle; + float16_t *pIn0, *pIn1; + uint32_t n2; + float16_t onebyfftLen = arm_inverse_fft_length_f16(fftLen); + uint32_t blkCnt; + f16x8_t vecIn0, vecIn1, vecSum, vecDiff; + f16x8_t vecCmplxTmp, vecTw; + + + n2 = fftLen >> 1; + pIn0 = pSrc; + pIn1 = pSrc + fftLen; + pCoefVec = pCoef; + + blkCnt = n2 / 4; + while (blkCnt > 0U) + { + vecIn0 = *(f16x8_t *) pIn0; + vecIn1 = *(f16x8_t *) pIn1; + vecTw = vld1q(pCoefVec); + pCoefVec += 8; + + vecSum = vaddq(vecIn0, vecIn1); + vecDiff = vsubq(vecIn0, vecIn1); + + vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff); + + vst1q(pIn0, vecSum); + pIn0 += 8; + vst1q(pIn1, vecCmplxTmp); + pIn1 += 8; + + blkCnt--; + } + + _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen); + + _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen); +} + + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the floating-point complex FFT. + @param[in] S points to an instance of the floating-point CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ + + +void arm_cfft_f16( + const arm_cfft_instance_f16 * S, + float16_t * pSrc, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t fftLen = S->fftLen; + + if (ifftFlag == 1U) { + + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen)); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen); + break; + } + } else { + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen); + break; + } + } + + + if (bitReverseFlag) + { + + arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); + + } +} + +#else + +#if defined(ARM_FLOAT16_SUPPORTED) + +extern void arm_bitreversal_16( + uint16_t * pSrc, + const uint16_t bitRevLen, + const uint16_t * pBitRevTable); + + +extern void arm_cfft_radix4by2_f16( + float16_t * pSrc, + uint32_t fftLen, + const float16_t * pCoef); + +extern void arm_radix4_butterfly_f16( + float16_t * pSrc, + uint16_t fftLen, + const float16_t * pCoef, + uint16_t twidCoefModifier); + +/** + @ingroup groupTransforms + */ + +/** + @defgroup ComplexFFT Complex FFT Functions + + @par + The Fast Fourier Transform (FFT) is an efficient algorithm for computing the + Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster + than the DFT, especially for long lengths. + The algorithms described in this section + operate on complex data. A separate set of functions is devoted to handling + of real sequences. + @par + There are separate algorithms for handling floating-point, Q15, and Q31 data + types. The algorithms available for each data type are described next. + @par + The FFT functions operate in-place. That is, the array holding the input data + will also be used to hold the corresponding result. The input data is complex + and contains <code>2*fftLen</code> interleaved values as shown below. + <pre>{real[0], imag[0], real[1], imag[1], ...} </pre> + The FFT result will be contained in the same array and the frequency domain + values will have the same interleaving. + + @par Floating-point + The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8 + stages are performed along with a single radix-2 or radix-4 stage, as needed. + The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses + a different twiddle factor table. + @par + The function uses the standard FFT definition and output values may grow by a + factor of <code>fftLen</code> when computing the forward transform. The + inverse transform includes a scale of <code>1/fftLen</code> as part of the + calculation and this matches the textbook definition of the inverse FFT. + @par + For the MVE version, the new arm_cfft_init_f32 initialization function is + <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the + needed FFTs.</b> Other FFT versions can continue to be initialized as + explained below. + @par + For not MVE versions, pre-initialized data structures containing twiddle factors + and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include + this header in your function and then pass one of the constant structures as + an argument to arm_cfft_f32. For example: + @par + <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code> + @par + computes a 64-point inverse complex FFT including bit reversal. + The data structures are treated as constant data and not modified during the + calculation. The same data structure can be reused for multiple transforms + including mixing forward and inverse transforms. + @par + Earlier releases of the library provided separate radix-2 and radix-4 + algorithms that operated on floating-point data. These functions are still + provided but are deprecated. The older functions are slower and less general + than the new functions. + @par + An example of initialization of the constants for the arm_cfft_f32 function follows: + @code + const static arm_cfft_instance_f32 *S; + ... + switch (length) { + case 16: + S = &arm_cfft_sR_f32_len16; + break; + case 32: + S = &arm_cfft_sR_f32_len32; + break; + case 64: + S = &arm_cfft_sR_f32_len64; + break; + case 128: + S = &arm_cfft_sR_f32_len128; + break; + case 256: + S = &arm_cfft_sR_f32_len256; + break; + case 512: + S = &arm_cfft_sR_f32_len512; + break; + case 1024: + S = &arm_cfft_sR_f32_len1024; + break; + case 2048: + S = &arm_cfft_sR_f32_len2048; + break; + case 4096: + S = &arm_cfft_sR_f32_len4096; + break; + } + @endcode + @par + The new arm_cfft_init_f32 can also be used. + @par Q15 and Q31 + The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-4 + stages are performed along with a single radix-2 stage, as needed. + The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses + a different twiddle factor table. + @par + The function uses the standard FFT definition and output values may grow by a + factor of <code>fftLen</code> when computing the forward transform. The + inverse transform includes a scale of <code>1/fftLen</code> as part of the + calculation and this matches the textbook definition of the inverse FFT. + @par + Pre-initialized data structures containing twiddle factors and bit reversal + tables are provided and defined in <code>arm_const_structs.h</code>. Include + this header in your function and then pass one of the constant structures as + an argument to arm_cfft_q31. For example: + @par + <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code> + @par + computes a 64-point inverse complex FFT including bit reversal. + The data structures are treated as constant data and not modified during the + calculation. The same data structure can be reused for multiple transforms + including mixing forward and inverse transforms. + @par + Earlier releases of the library provided separate radix-2 and radix-4 + algorithms that operated on floating-point data. These functions are still + provided but are deprecated. The older functions are slower and less general + than the new functions. + @par + An example of initialization of the constants for the arm_cfft_q31 function follows: + @code + const static arm_cfft_instance_q31 *S; + ... + switch (length) { + case 16: + S = &arm_cfft_sR_q31_len16; + break; + case 32: + S = &arm_cfft_sR_q31_len32; + break; + case 64: + S = &arm_cfft_sR_q31_len64; + break; + case 128: + S = &arm_cfft_sR_q31_len128; + break; + case 256: + S = &arm_cfft_sR_q31_len256; + break; + case 512: + S = &arm_cfft_sR_q31_len512; + break; + case 1024: + S = &arm_cfft_sR_q31_len1024; + break; + case 2048: + S = &arm_cfft_sR_q31_len2048; + break; + case 4096: + S = &arm_cfft_sR_q31_len4096; + break; + } + @endcode + + */ + + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the floating-point complex FFT. + @param[in] S points to an instance of the floating-point CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ + +void arm_cfft_f16( + const arm_cfft_instance_f16 * S, + float16_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t L = S->fftLen, l; + float16_t invL, * pSrc; + + if (ifftFlag == 1U) + { + /* Conjugate input data */ + pSrc = p1 + 1; + for(l=0; l<L; l++) + { + *pSrc = -(_Float16)*pSrc; + pSrc += 2; + } + } + + switch (L) + { + + case 16: + case 64: + case 256: + case 1024: + case 4096: + arm_radix4_butterfly_f16 (p1, L, (float16_t*)S->pTwiddle, 1U); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle); + break; + + } + + if ( bitReverseFlag ) + arm_bitreversal_16((uint16_t*)p1, S->bitRevLength,(uint16_t*)S->pBitRevTable); + + if (ifftFlag == 1U) + { + invL = 1.0f16/(_Float16)L; + /* Conjugate and scale output data */ + pSrc = p1; + for(l=0; l<L; l++) + { + *pSrc++ *= (_Float16)invL ; + *pSrc = -(_Float16)(*pSrc) * (_Float16)invL; + pSrc++; + } + } +} +#endif /* if defined(ARM_FLOAT16_SUPPORTED) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c new file mode 100644 index 0000000..0b33e8e --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c @@ -0,0 +1,1192 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_f32.c + * Description: Combined Radix Decimation in Frequency CFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_fft.h" +#include "arm_mve_tables.h" + + +static float32_t arm_inverse_fft_length_f32(uint16_t fftLen) +{ + float32_t retValue=1.0; + + switch (fftLen) + { + + case 4096U: + retValue = 0.000244140625; + break; + + case 2048U: + retValue = 0.00048828125; + break; + + case 1024U: + retValue = 0.0009765625f; + break; + + case 512U: + retValue = 0.001953125; + break; + + case 256U: + retValue = 0.00390625f; + break; + + case 128U: + retValue = 0.0078125; + break; + + case 64U: + retValue = 0.015625f; + break; + + case 32U: + retValue = 0.03125; + break; + + case 16U: + retValue = 0.0625f; + break; + + + default: + break; + } + return(retValue); +} + + + + +static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen) +{ + f32x4_t vecTmp0, vecTmp1; + f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1; + f32x4_t vecA, vecB, vecC, vecD; + uint32_t blkCnt; + uint32_t n1, n2; + uint32_t stage = 0; + int32_t iter = 1; + static const int32_t strides[4] = { + (0 - 16) * (int32_t)sizeof(q31_t *), + (1 - 16) * (int32_t)sizeof(q31_t *), + (8 - 16) * (int32_t)sizeof(q31_t *), + (9 - 16) * (int32_t)sizeof(q31_t *) + }; + + n2 = fftLen; + n1 = n2; + n2 >>= 2u; + for (int k = fftLen / 4u; k > 1; k >>= 2) + { + float32_t const *p_rearranged_twiddle_tab_stride1 = + &S->rearranged_twiddle_stride1[ + S->rearranged_twiddle_tab_stride1_arr[stage]]; + float32_t const *p_rearranged_twiddle_tab_stride2 = + &S->rearranged_twiddle_stride2[ + S->rearranged_twiddle_tab_stride2_arr[stage]]; + float32_t const *p_rearranged_twiddle_tab_stride3 = + &S->rearranged_twiddle_stride3[ + S->rearranged_twiddle_tab_stride3_arr[stage]]; + + float32_t * pBase = pSrc; + for (int i = 0; i < iter; i++) + { + float32_t *inA = pBase; + float32_t *inB = inA + n2 * CMPLX_DIM; + float32_t *inC = inB + n2 * CMPLX_DIM; + float32_t *inD = inC + n2 * CMPLX_DIM; + float32_t const *pW1 = p_rearranged_twiddle_tab_stride1; + float32_t const *pW2 = p_rearranged_twiddle_tab_stride2; + float32_t const *pW3 = p_rearranged_twiddle_tab_stride3; + f32x4_t vecW; + + blkCnt = n2 / 2; + /* + * load 2 f32 complex pair + */ + vecA = vldrwq_f32(inA); + vecC = vldrwq_f32(inC); + while (blkCnt > 0U) + { + vecB = vldrwq_f32(inB); + vecD = vldrwq_f32(inD); + + vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ + vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ + + vecSum1 = vecB + vecD; + vecDiff1 = vecB - vecD; + /* + * [ 1 1 1 1 ] * [ A B C D ]' .* 1 + */ + vecTmp0 = vecSum0 + vecSum1; + vst1q(inA, vecTmp0); + inA += 4; + + /* + * [ 1 -1 1 -1 ] * [ A B C D ]' + */ + vecTmp0 = vecSum0 - vecSum1; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2 + */ + vecW = vld1q(pW2); + pW2 += 4; + vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); + vst1q(inB, vecTmp1); + inB += 4; + + /* + * [ 1 -i -1 +i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 -i -1 +i ] * [ A B C D ]'.* W1 + */ + vecW = vld1q(pW1); + pW1 +=4; + vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); + vst1q(inC, vecTmp1); + inC += 4; + + /* + * [ 1 +i -1 -i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 +i -1 -i ] * [ A B C D ]'.* W3 + */ + vecW = vld1q(pW3); + pW3 += 4; + vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); + vst1q(inD, vecTmp1); + inD += 4; + + vecA = vldrwq_f32(inA); + vecC = vldrwq_f32(inC); + + blkCnt--; + } + pBase += CMPLX_DIM * n1; + } + n1 = n2; + n2 >>= 2u; + iter = iter << 2; + stage++; + } + + /* + * start of Last stage process + */ + uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides); + vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; + + /* load scheduling */ + vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = vldrwq_gather_base_f32(vecScGathAddr, 16); + + blkCnt = (fftLen >> 3); + while (blkCnt > 0U) + { + vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ + vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ + + vecB = vldrwq_gather_base_f32(vecScGathAddr, 8); + vecD = vldrwq_gather_base_f32(vecScGathAddr, 24); + + vecSum1 = vecB + vecD; + vecDiff1 = vecB - vecD; + + /* pre-load for next iteration */ + vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = vldrwq_gather_base_f32(vecScGathAddr, 16); + + vecTmp0 = vecSum0 + vecSum1; + vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0); + + vecTmp0 = vecSum0 - vecSum1; + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0); + + vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0); + + vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0); + + blkCnt--; + } + + /* + * End of last stage process + */ +} + +static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen) +{ + float32_t const *pCoefVec; + float32_t const *pCoef = S->pTwiddle; + float32_t *pIn0, *pIn1; + uint32_t n2; + uint32_t blkCnt; + f32x4_t vecIn0, vecIn1, vecSum, vecDiff; + f32x4_t vecCmplxTmp, vecTw; + + + n2 = fftLen >> 1; + pIn0 = pSrc; + pIn1 = pSrc + fftLen; + pCoefVec = pCoef; + + blkCnt = n2 / 2; + while (blkCnt > 0U) + { + vecIn0 = *(f32x4_t *) pIn0; + vecIn1 = *(f32x4_t *) pIn1; + vecTw = vld1q(pCoefVec); + pCoefVec += 4; + + vecSum = vecIn0 + vecIn1; + vecDiff = vecIn0 - vecIn1; + + vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff); + + vst1q(pIn0, vecSum); + pIn0 += 4; + vst1q(pIn1, vecCmplxTmp); + pIn1 += 4; + + blkCnt--; + } + + _arm_radix4_butterfly_f32_mve(S, pSrc, n2); + + _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2); + + pIn0 = pSrc; +} + +static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen) +{ + f32x4_t vecTmp0, vecTmp1; + f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1; + f32x4_t vecA, vecB, vecC, vecD; + uint32_t blkCnt; + uint32_t n1, n2; + uint32_t stage = 0; + int32_t iter = 1; + static const int32_t strides[4] = { + (0 - 16) * (int32_t)sizeof(q31_t *), + (1 - 16) * (int32_t)sizeof(q31_t *), + (8 - 16) * (int32_t)sizeof(q31_t *), + (9 - 16) * (int32_t)sizeof(q31_t *) + }; + + n2 = fftLen; + n1 = n2; + n2 >>= 2u; + for (int k = fftLen / 4; k > 1; k >>= 2) + { + float32_t const *p_rearranged_twiddle_tab_stride1 = + &S->rearranged_twiddle_stride1[ + S->rearranged_twiddle_tab_stride1_arr[stage]]; + float32_t const *p_rearranged_twiddle_tab_stride2 = + &S->rearranged_twiddle_stride2[ + S->rearranged_twiddle_tab_stride2_arr[stage]]; + float32_t const *p_rearranged_twiddle_tab_stride3 = + &S->rearranged_twiddle_stride3[ + S->rearranged_twiddle_tab_stride3_arr[stage]]; + + float32_t * pBase = pSrc; + for (int i = 0; i < iter; i++) + { + float32_t *inA = pBase; + float32_t *inB = inA + n2 * CMPLX_DIM; + float32_t *inC = inB + n2 * CMPLX_DIM; + float32_t *inD = inC + n2 * CMPLX_DIM; + float32_t const *pW1 = p_rearranged_twiddle_tab_stride1; + float32_t const *pW2 = p_rearranged_twiddle_tab_stride2; + float32_t const *pW3 = p_rearranged_twiddle_tab_stride3; + f32x4_t vecW; + + blkCnt = n2 / 2; + /* + * load 2 f32 complex pair + */ + vecA = vldrwq_f32(inA); + vecC = vldrwq_f32(inC); + while (blkCnt > 0U) + { + vecB = vldrwq_f32(inB); + vecD = vldrwq_f32(inD); + + vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ + vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ + + vecSum1 = vecB + vecD; + vecDiff1 = vecB - vecD; + /* + * [ 1 1 1 1 ] * [ A B C D ]' .* 1 + */ + vecTmp0 = vecSum0 + vecSum1; + vst1q(inA, vecTmp0); + inA += 4; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]' + */ + vecTmp0 = vecSum0 - vecSum1; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1 + */ + vecW = vld1q(pW2); + pW2 += 4; + vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); + vst1q(inB, vecTmp1); + inB += 4; + + /* + * [ 1 -i -1 +i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 -i -1 +i ] * [ A B C D ]'.* W2 + */ + vecW = vld1q(pW1); + pW1 += 4; + vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); + vst1q(inC, vecTmp1); + inC += 4; + + /* + * [ 1 +i -1 -i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 +i -1 -i ] * [ A B C D ]'.* W3 + */ + vecW = vld1q(pW3); + pW3 += 4; + vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); + vst1q(inD, vecTmp1); + inD += 4; + + vecA = vldrwq_f32(inA); + vecC = vldrwq_f32(inC); + + blkCnt--; + } + pBase += CMPLX_DIM * n1; + } + n1 = n2; + n2 >>= 2u; + iter = iter << 2; + stage++; + } + + /* + * start of Last stage process + */ + uint32x4_t vecScGathAddr = vld1q_u32 ((uint32_t*)strides); + vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; + + /* + * load scheduling + */ + vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = vldrwq_gather_base_f32(vecScGathAddr, 16); + + blkCnt = (fftLen >> 3); + while (blkCnt > 0U) + { + vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ + vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ + + vecB = vldrwq_gather_base_f32(vecScGathAddr, 8); + vecD = vldrwq_gather_base_f32(vecScGathAddr, 24); + + vecSum1 = vecB + vecD; + vecDiff1 = vecB - vecD; + + vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = vldrwq_gather_base_f32(vecScGathAddr, 16); + + vecTmp0 = vecSum0 + vecSum1; + vecTmp0 = vecTmp0 * onebyfftLen; + vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0); + + vecTmp0 = vecSum0 - vecSum1; + vecTmp0 = vecTmp0 * onebyfftLen; + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0); + + vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); + vecTmp0 = vecTmp0 * onebyfftLen; + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0); + + vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); + vecTmp0 = vecTmp0 * onebyfftLen; + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0); + + blkCnt--; + } + + /* + * End of last stage process + */ +} + +static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen) +{ + float32_t const *pCoefVec; + float32_t const *pCoef = S->pTwiddle; + float32_t *pIn0, *pIn1; + uint32_t n2; + float32_t onebyfftLen = arm_inverse_fft_length_f32(fftLen); + uint32_t blkCnt; + f32x4_t vecIn0, vecIn1, vecSum, vecDiff; + f32x4_t vecCmplxTmp, vecTw; + + + n2 = fftLen >> 1; + pIn0 = pSrc; + pIn1 = pSrc + fftLen; + pCoefVec = pCoef; + + blkCnt = n2 / 2; + while (blkCnt > 0U) + { + vecIn0 = *(f32x4_t *) pIn0; + vecIn1 = *(f32x4_t *) pIn1; + vecTw = vld1q(pCoefVec); + pCoefVec += 4; + + vecSum = vecIn0 + vecIn1; + vecDiff = vecIn0 - vecIn1; + + vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff); + + vst1q(pIn0, vecSum); + pIn0 += 4; + vst1q(pIn1, vecCmplxTmp); + pIn1 += 4; + + blkCnt--; + } + + _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen); + + _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen); +} + + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the floating-point complex FFT. + @param[in] S points to an instance of the floating-point CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ + + +void arm_cfft_f32( + const arm_cfft_instance_f32 * S, + float32_t * pSrc, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t fftLen = S->fftLen; + + if (ifftFlag == 1U) { + + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen)); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen); + break; + } + } else { + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen); + break; + } + } + + + if (bitReverseFlag) + { + + arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); + + } +} + + +#else +extern void arm_radix8_butterfly_f32( + float32_t * pSrc, + uint16_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier); + +extern void arm_bitreversal_32( + uint32_t * pSrc, + const uint16_t bitRevLen, + const uint16_t * pBitRevTable); + +/** + @ingroup groupTransforms + */ + +/** + @defgroup ComplexFFT Complex FFT Functions + + @par + The Fast Fourier Transform (FFT) is an efficient algorithm for computing the + Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster + than the DFT, especially for long lengths. + The algorithms described in this section + operate on complex data. A separate set of functions is devoted to handling + of real sequences. + @par + There are separate algorithms for handling floating-point, Q15, and Q31 data + types. The algorithms available for each data type are described next. + @par + The FFT functions operate in-place. That is, the array holding the input data + will also be used to hold the corresponding result. The input data is complex + and contains <code>2*fftLen</code> interleaved values as shown below. + <pre>{real[0], imag[0], real[1], imag[1], ...} </pre> + The FFT result will be contained in the same array and the frequency domain + values will have the same interleaving. + + @par Floating-point + The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8 + stages are performed along with a single radix-2 or radix-4 stage, as needed. + The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses + a different twiddle factor table. + @par + The function uses the standard FFT definition and output values may grow by a + factor of <code>fftLen</code> when computing the forward transform. The + inverse transform includes a scale of <code>1/fftLen</code> as part of the + calculation and this matches the textbook definition of the inverse FFT. + @par + For the MVE version, the new arm_cfft_init_f32 initialization function is + <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the + needed FFTs.</b> Other FFT versions can continue to be initialized as + explained below. + @par + For not MVE versions, pre-initialized data structures containing twiddle factors + and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include + this header in your function and then pass one of the constant structures as + an argument to arm_cfft_f32. For example: + @par + <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code> + @par + computes a 64-point inverse complex FFT including bit reversal. + The data structures are treated as constant data and not modified during the + calculation. The same data structure can be reused for multiple transforms + including mixing forward and inverse transforms. + @par + Earlier releases of the library provided separate radix-2 and radix-4 + algorithms that operated on floating-point data. These functions are still + provided but are deprecated. The older functions are slower and less general + than the new functions. + @par + An example of initialization of the constants for the arm_cfft_f32 function follows: + @code + const static arm_cfft_instance_f32 *S; + ... + switch (length) { + case 16: + S = &arm_cfft_sR_f32_len16; + break; + case 32: + S = &arm_cfft_sR_f32_len32; + break; + case 64: + S = &arm_cfft_sR_f32_len64; + break; + case 128: + S = &arm_cfft_sR_f32_len128; + break; + case 256: + S = &arm_cfft_sR_f32_len256; + break; + case 512: + S = &arm_cfft_sR_f32_len512; + break; + case 1024: + S = &arm_cfft_sR_f32_len1024; + break; + case 2048: + S = &arm_cfft_sR_f32_len2048; + break; + case 4096: + S = &arm_cfft_sR_f32_len4096; + break; + } + @endcode + @par + The new arm_cfft_init_f32 can also be used. + @par Q15 and Q31 + The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-4 + stages are performed along with a single radix-2 stage, as needed. + The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses + a different twiddle factor table. + @par + The function uses the standard FFT definition and output values may grow by a + factor of <code>fftLen</code> when computing the forward transform. The + inverse transform includes a scale of <code>1/fftLen</code> as part of the + calculation and this matches the textbook definition of the inverse FFT. + @par + Pre-initialized data structures containing twiddle factors and bit reversal + tables are provided and defined in <code>arm_const_structs.h</code>. Include + this header in your function and then pass one of the constant structures as + an argument to arm_cfft_q31. For example: + @par + <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code> + @par + computes a 64-point inverse complex FFT including bit reversal. + The data structures are treated as constant data and not modified during the + calculation. The same data structure can be reused for multiple transforms + including mixing forward and inverse transforms. + @par + Earlier releases of the library provided separate radix-2 and radix-4 + algorithms that operated on floating-point data. These functions are still + provided but are deprecated. The older functions are slower and less general + than the new functions. + @par + An example of initialization of the constants for the arm_cfft_q31 function follows: + @code + const static arm_cfft_instance_q31 *S; + ... + switch (length) { + case 16: + S = &arm_cfft_sR_q31_len16; + break; + case 32: + S = &arm_cfft_sR_q31_len32; + break; + case 64: + S = &arm_cfft_sR_q31_len64; + break; + case 128: + S = &arm_cfft_sR_q31_len128; + break; + case 256: + S = &arm_cfft_sR_q31_len256; + break; + case 512: + S = &arm_cfft_sR_q31_len512; + break; + case 1024: + S = &arm_cfft_sR_q31_len1024; + break; + case 2048: + S = &arm_cfft_sR_q31_len2048; + break; + case 4096: + S = &arm_cfft_sR_q31_len4096; + break; + } + @endcode + + */ + +void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1) +{ + uint32_t L = S->fftLen; + float32_t * pCol1, * pCol2, * pMid1, * pMid2; + float32_t * p2 = p1 + L; + const float32_t * tw = (float32_t *) S->pTwiddle; + float32_t t1[4], t2[4], t3[4], t4[4], twR, twI; + float32_t m0, m1, m2, m3; + uint32_t l; + + pCol1 = p1; + pCol2 = p2; + + /* Define new length */ + L >>= 1; + + /* Initialize mid pointers */ + pMid1 = p1 + L; + pMid2 = p2 + L; + + /* do two dot Fourier transform */ + for (l = L >> 2; l > 0; l-- ) + { + t1[0] = p1[0]; + t1[1] = p1[1]; + t1[2] = p1[2]; + t1[3] = p1[3]; + + t2[0] = p2[0]; + t2[1] = p2[1]; + t2[2] = p2[2]; + t2[3] = p2[3]; + + t3[0] = pMid1[0]; + t3[1] = pMid1[1]; + t3[2] = pMid1[2]; + t3[3] = pMid1[3]; + + t4[0] = pMid2[0]; + t4[1] = pMid2[1]; + t4[2] = pMid2[2]; + t4[3] = pMid2[3]; + + *p1++ = t1[0] + t2[0]; + *p1++ = t1[1] + t2[1]; + *p1++ = t1[2] + t2[2]; + *p1++ = t1[3] + t2[3]; /* col 1 */ + + t2[0] = t1[0] - t2[0]; + t2[1] = t1[1] - t2[1]; + t2[2] = t1[2] - t2[2]; + t2[3] = t1[3] - t2[3]; /* for col 2 */ + + *pMid1++ = t3[0] + t4[0]; + *pMid1++ = t3[1] + t4[1]; + *pMid1++ = t3[2] + t4[2]; + *pMid1++ = t3[3] + t4[3]; /* col 1 */ + + t4[0] = t4[0] - t3[0]; + t4[1] = t4[1] - t3[1]; + t4[2] = t4[2] - t3[2]; + t4[3] = t4[3] - t3[3]; /* for col 2 */ + + twR = *tw++; + twI = *tw++; + + /* multiply by twiddle factors */ + m0 = t2[0] * twR; + m1 = t2[1] * twI; + m2 = t2[1] * twR; + m3 = t2[0] * twI; + + /* R = R * Tr - I * Ti */ + *p2++ = m0 + m1; + /* I = I * Tr + R * Ti */ + *p2++ = m2 - m3; + + /* use vertical symmetry */ + /* 0.9988 - 0.0491i <==> -0.0491 - 0.9988i */ + m0 = t4[0] * twI; + m1 = t4[1] * twR; + m2 = t4[1] * twI; + m3 = t4[0] * twR; + + *pMid2++ = m0 - m1; + *pMid2++ = m2 + m3; + + twR = *tw++; + twI = *tw++; + + m0 = t2[2] * twR; + m1 = t2[3] * twI; + m2 = t2[3] * twR; + m3 = t2[2] * twI; + + *p2++ = m0 + m1; + *p2++ = m2 - m3; + + m0 = t4[2] * twI; + m1 = t4[3] * twR; + m2 = t4[3] * twI; + m3 = t4[2] * twR; + + *pMid2++ = m0 - m1; + *pMid2++ = m2 + m3; + } + + /* first col */ + arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U); + + /* second col */ + arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U); +} + +void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1) +{ + uint32_t L = S->fftLen >> 1; + float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4; + const float32_t *tw2, *tw3, *tw4; + float32_t * p2 = p1 + L; + float32_t * p3 = p2 + L; + float32_t * p4 = p3 + L; + float32_t t2[4], t3[4], t4[4], twR, twI; + float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1; + float32_t m0, m1, m2, m3; + uint32_t l, twMod2, twMod3, twMod4; + + pCol1 = p1; /* points to real values by default */ + pCol2 = p2; + pCol3 = p3; + pCol4 = p4; + pEnd1 = p2 - 1; /* points to imaginary values by default */ + pEnd2 = p3 - 1; + pEnd3 = p4 - 1; + pEnd4 = pEnd3 + L; + + tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle; + + L >>= 1; + + /* do four dot Fourier transform */ + + twMod2 = 2; + twMod3 = 4; + twMod4 = 6; + + /* TOP */ + p1ap3_0 = p1[0] + p3[0]; + p1sp3_0 = p1[0] - p3[0]; + p1ap3_1 = p1[1] + p3[1]; + p1sp3_1 = p1[1] - p3[1]; + + /* col 2 */ + t2[0] = p1sp3_0 + p2[1] - p4[1]; + t2[1] = p1sp3_1 - p2[0] + p4[0]; + /* col 3 */ + t3[0] = p1ap3_0 - p2[0] - p4[0]; + t3[1] = p1ap3_1 - p2[1] - p4[1]; + /* col 4 */ + t4[0] = p1sp3_0 - p2[1] + p4[1]; + t4[1] = p1sp3_1 + p2[0] - p4[0]; + /* col 1 */ + *p1++ = p1ap3_0 + p2[0] + p4[0]; + *p1++ = p1ap3_1 + p2[1] + p4[1]; + + /* Twiddle factors are ones */ + *p2++ = t2[0]; + *p2++ = t2[1]; + *p3++ = t3[0]; + *p3++ = t3[1]; + *p4++ = t4[0]; + *p4++ = t4[1]; + + tw2 += twMod2; + tw3 += twMod3; + tw4 += twMod4; + + for (l = (L - 2) >> 1; l > 0; l-- ) + { + /* TOP */ + p1ap3_0 = p1[0] + p3[0]; + p1sp3_0 = p1[0] - p3[0]; + p1ap3_1 = p1[1] + p3[1]; + p1sp3_1 = p1[1] - p3[1]; + /* col 2 */ + t2[0] = p1sp3_0 + p2[1] - p4[1]; + t2[1] = p1sp3_1 - p2[0] + p4[0]; + /* col 3 */ + t3[0] = p1ap3_0 - p2[0] - p4[0]; + t3[1] = p1ap3_1 - p2[1] - p4[1]; + /* col 4 */ + t4[0] = p1sp3_0 - p2[1] + p4[1]; + t4[1] = p1sp3_1 + p2[0] - p4[0]; + /* col 1 - top */ + *p1++ = p1ap3_0 + p2[0] + p4[0]; + *p1++ = p1ap3_1 + p2[1] + p4[1]; + + /* BOTTOM */ + p1ap3_1 = pEnd1[-1] + pEnd3[-1]; + p1sp3_1 = pEnd1[-1] - pEnd3[-1]; + p1ap3_0 = pEnd1[ 0] + pEnd3[0]; + p1sp3_0 = pEnd1[ 0] - pEnd3[0]; + /* col 2 */ + t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1; + t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1]; + /* col 3 */ + t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1]; + t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0]; + /* col 4 */ + t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1; + t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0; + /* col 1 - Bottom */ + *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0]; + *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1]; + + /* COL 2 */ + /* read twiddle factors */ + twR = *tw2++; + twI = *tw2++; + /* multiply by twiddle factors */ + /* let Z1 = a + i(b), Z2 = c + i(d) */ + /* => Z1 * Z2 = (a*c - b*d) + i(b*c + a*d) */ + + /* Top */ + m0 = t2[0] * twR; + m1 = t2[1] * twI; + m2 = t2[1] * twR; + m3 = t2[0] * twI; + + *p2++ = m0 + m1; + *p2++ = m2 - m3; + /* use vertical symmetry col 2 */ + /* 0.9997 - 0.0245i <==> 0.0245 - 0.9997i */ + /* Bottom */ + m0 = t2[3] * twI; + m1 = t2[2] * twR; + m2 = t2[2] * twI; + m3 = t2[3] * twR; + + *pEnd2-- = m0 - m1; + *pEnd2-- = m2 + m3; + + /* COL 3 */ + twR = tw3[0]; + twI = tw3[1]; + tw3 += twMod3; + /* Top */ + m0 = t3[0] * twR; + m1 = t3[1] * twI; + m2 = t3[1] * twR; + m3 = t3[0] * twI; + + *p3++ = m0 + m1; + *p3++ = m2 - m3; + /* use vertical symmetry col 3 */ + /* 0.9988 - 0.0491i <==> -0.9988 - 0.0491i */ + /* Bottom */ + m0 = -t3[3] * twR; + m1 = t3[2] * twI; + m2 = t3[2] * twR; + m3 = t3[3] * twI; + + *pEnd3-- = m0 - m1; + *pEnd3-- = m3 - m2; + + /* COL 4 */ + twR = tw4[0]; + twI = tw4[1]; + tw4 += twMod4; + /* Top */ + m0 = t4[0] * twR; + m1 = t4[1] * twI; + m2 = t4[1] * twR; + m3 = t4[0] * twI; + + *p4++ = m0 + m1; + *p4++ = m2 - m3; + /* use vertical symmetry col 4 */ + /* 0.9973 - 0.0736i <==> -0.0736 + 0.9973i */ + /* Bottom */ + m0 = t4[3] * twI; + m1 = t4[2] * twR; + m2 = t4[2] * twI; + m3 = t4[3] * twR; + + *pEnd4-- = m0 - m1; + *pEnd4-- = m2 + m3; + } + + /* MIDDLE */ + /* Twiddle factors are */ + /* 1.0000 0.7071-0.7071i -1.0000i -0.7071-0.7071i */ + p1ap3_0 = p1[0] + p3[0]; + p1sp3_0 = p1[0] - p3[0]; + p1ap3_1 = p1[1] + p3[1]; + p1sp3_1 = p1[1] - p3[1]; + + /* col 2 */ + t2[0] = p1sp3_0 + p2[1] - p4[1]; + t2[1] = p1sp3_1 - p2[0] + p4[0]; + /* col 3 */ + t3[0] = p1ap3_0 - p2[0] - p4[0]; + t3[1] = p1ap3_1 - p2[1] - p4[1]; + /* col 4 */ + t4[0] = p1sp3_0 - p2[1] + p4[1]; + t4[1] = p1sp3_1 + p2[0] - p4[0]; + /* col 1 - Top */ + *p1++ = p1ap3_0 + p2[0] + p4[0]; + *p1++ = p1ap3_1 + p2[1] + p4[1]; + + /* COL 2 */ + twR = tw2[0]; + twI = tw2[1]; + + m0 = t2[0] * twR; + m1 = t2[1] * twI; + m2 = t2[1] * twR; + m3 = t2[0] * twI; + + *p2++ = m0 + m1; + *p2++ = m2 - m3; + /* COL 3 */ + twR = tw3[0]; + twI = tw3[1]; + + m0 = t3[0] * twR; + m1 = t3[1] * twI; + m2 = t3[1] * twR; + m3 = t3[0] * twI; + + *p3++ = m0 + m1; + *p3++ = m2 - m3; + /* COL 4 */ + twR = tw4[0]; + twI = tw4[1]; + + m0 = t4[0] * twR; + m1 = t4[1] * twI; + m2 = t4[1] * twR; + m3 = t4[0] * twI; + + *p4++ = m0 + m1; + *p4++ = m2 - m3; + + /* first col */ + arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U); + + /* second col */ + arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U); + + /* third col */ + arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U); + + /* fourth col */ + arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U); +} + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the floating-point complex FFT. + @param[in] S points to an instance of the floating-point CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ + +void arm_cfft_f32( + const arm_cfft_instance_f32 * S, + float32_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t L = S->fftLen, l; + float32_t invL, * pSrc; + + if (ifftFlag == 1U) + { + /* Conjugate input data */ + pSrc = p1 + 1; + for (l = 0; l < L; l++) + { + *pSrc = -*pSrc; + pSrc += 2; + } + } + + switch (L) + { + case 16: + case 128: + case 1024: + arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1); + break; + case 32: + case 256: + case 2048: + arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1); + break; + case 64: + case 512: + case 4096: + arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1); + break; + } + + if ( bitReverseFlag ) + arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable); + + if (ifftFlag == 1U) + { + invL = 1.0f / (float32_t)L; + + /* Conjugate and scale output data */ + pSrc = p1; + for (l= 0; l < L; l++) + { + *pSrc++ *= invL ; + *pSrc = -(*pSrc) * invL; + pSrc++; + } + } +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c new file mode 100644 index 0000000..d686eb8 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c @@ -0,0 +1,318 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_f64.c + * Description: Combined Radix Decimation in Frequency CFFT Double Precision Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + + +extern void arm_radix4_butterfly_f64( + float64_t * pSrc, + uint16_t fftLen, + const float64_t * pCoef, + uint16_t twidCoefModifier); + +extern void arm_bitreversal_64( + uint64_t * pSrc, + const uint16_t bitRevLen, + const uint16_t * pBitRevTable); + +/** +* @} end of ComplexFFT group +*/ + +/* ---------------------------------------------------------------------- + * Internal helper function used by the FFTs + * ---------------------------------------------------------------------- */ + +/* +* @brief Core function for the Double Precision floating-point CFFT butterfly process. +* @param[in, out] *pSrc points to the in-place buffer of F64 data type. +* @param[in] fftLen length of the FFT. +* @param[in] *pCoef points to the twiddle coefficient buffer. +* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. +* @return none. +*/ + +void arm_radix4_butterfly_f64( + float64_t * pSrc, + uint16_t fftLen, + const float64_t * pCoef, + uint16_t twidCoefModifier) +{ + + float64_t co1, co2, co3, si1, si2, si3; + uint32_t ia1, ia2, ia3; + uint32_t i0, i1, i2, i3; + uint32_t n1, n2, j, k; + + float64_t t1, t2, r1, r2, s1, s2; + + + /* Initializations for the fft calculation */ + n2 = fftLen; + n1 = n2; + for (k = fftLen; k > 1U; k >>= 2U) + { + /* Initializations for the fft calculation */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* FFT Calculation */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* xa + xc */ + r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)]; + + /* xa - xc */ + r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)]; + + /* ya + yc */ + s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U]; + + /* ya - yc */ + s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U]; + + /* xb + xd */ + t1 = pSrc[2U * i1] + pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = r1 + t1; + + /* xa + xc -(xb + xd) */ + r1 = r1 - t1; + + /* yb + yd */ + t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U]; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = s1 + t2; + + /* (ya + yc) - (yb + yd) */ + s1 = s1 - t2; + + /* (yb - yd) */ + t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U]; + + /* (xb - xd) */ + t2 = pSrc[2U * i1] - pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = (r1 * co2) + (s1 * si2); + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = (s1 * co2) - (r1 * si2); + + /* (xa - xc) + (yb - yd) */ + r1 = r2 + t1; + + /* (xa - xc) - (yb - yd) */ + r2 = r2 - t1; + + /* (ya - yc) - (xb - xd) */ + s1 = s2 - t2; + + /* (ya - yc) + (xb - xd) */ + s2 = s2 + t2; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = (r1 * co1) + (s1 * si1); + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = (s1 * co1) - (r1 * si1); + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = (r2 * co3) + (s2 * si3); + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = (s2 * co3) - (r2 * si3); + + i0 += n1; + } while ( i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } +} + +/* +* @brief Core function for the Double Precision floating-point CFFT butterfly process. +* @param[in, out] *pSrc points to the in-place buffer of F64 data type. +* @param[in] fftLen length of the FFT. +* @param[in] *pCoef points to the twiddle coefficient buffer. +* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. +* @return none. +*/ + +void arm_cfft_radix4by2_f64( + float64_t * pSrc, + uint32_t fftLen, + const float64_t * pCoef) +{ + uint32_t i, l; + uint32_t n2, ia; + float64_t xt, yt, cosVal, sinVal; + float64_t p0, p1,p2,p3,a0,a1; + + n2 = fftLen >> 1; + ia = 0; + for (i = 0; i < n2; i++) + { + cosVal = pCoef[2*ia]; + sinVal = pCoef[2*ia + 1]; + ia++; + + l = i + n2; + + /* Butterfly implementation */ + a0 = pSrc[2 * i] + pSrc[2 * l]; + xt = pSrc[2 * i] - pSrc[2 * l]; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1]; + + p0 = xt * cosVal; + p1 = yt * sinVal; + p2 = yt * cosVal; + p3 = xt * sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = p0 + p1; + pSrc[2 * l + 1] = p2 - p3; + + } + + // first col + arm_radix4_butterfly_f64( pSrc, n2, (float64_t*)pCoef, 2U); + // second col + arm_radix4_butterfly_f64( pSrc + fftLen, n2, (float64_t*)pCoef, 2U); + +} + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the Double Precision floating-point complex FFT. + @param[in] S points to an instance of the Double Precision floating-point CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ + +void arm_cfft_f64( + const arm_cfft_instance_f64 * S, + float64_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t L = S->fftLen, l; + float64_t invL, * pSrc; + + if (ifftFlag == 1U) + { + /* Conjugate input data */ + pSrc = p1 + 1; + for(l=0; l<L; l++) + { + *pSrc = -*pSrc; + pSrc += 2; + } + } + + switch (L) + { + case 16: + case 64: + case 256: + case 1024: + case 4096: + arm_radix4_butterfly_f64 (p1, L, (float64_t*)S->pTwiddle, 1U); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_f64 ( p1, L, (float64_t*)S->pTwiddle); + break; + + } + + if ( bitReverseFlag ) + arm_bitreversal_64((uint64_t*)p1, S->bitRevLength,S->pBitRevTable); + + if (ifftFlag == 1U) + { + invL = 1.0 / (float64_t)L; + /* Conjugate and scale output data */ + pSrc = p1; + for(l=0; l<L; l++) + { + *pSrc++ *= invL ; + *pSrc = -(*pSrc) * invL; + pSrc++; + } + } +} + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c new file mode 100644 index 0000000..fdf887b --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c @@ -0,0 +1,363 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_init_f16.c + * Description: Initialization function for cfft f16 instance + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define FFTINIT(EXT,SIZE) \ + S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \ + S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \ + S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle; + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the cfft f16 function + @param[in,out] S points to an instance of the floating-point CFFT structure + @param[in] fftLen fft length (number of complex samples) + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + + @par Use of this function is mandatory only for the MVE version of the FFT. + Other versions can still initialize directly the data structure using + variables declared in arm_const_structs.h + */ + +#include "dsp/transform_functions_f16.h" +#include "arm_common_tables_f16.h" +#include "arm_const_structs_f16.h" + + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_vec_fft.h" +#include "arm_mve_tables_f16.h" + +arm_status arm_cfft_radix4by2_rearrange_twiddles_f16(arm_cfft_instance_f16 *S, int twidCoefModifier) +{ + + switch (S->fftLen >> (twidCoefModifier - 1)) { + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) + case 4096U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_f16; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_4096_f16; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_f16; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_4096_f16; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_f16; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_4096_f16; + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048) + case 1024U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_f16; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_1024_f16; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_f16; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_1024_f16; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_f16; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_1024_f16; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F16_256) || defined(ARM_TABLE_TWIDDLECOEF_F16_512) + case 256U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_f16; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_256_f16; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_f16; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_256_f16; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_f16; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_256_f16; + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F16_64) || defined(ARM_TABLE_TWIDDLECOEF_F16_128) + case 64U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_f16; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_64_f16; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_f16; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_64_f16; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_f16; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_64_f16; + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F16_16) || defined(ARM_TABLE_TWIDDLECOEF_F16_32) + case 16U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_f16; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_16_f16; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_f16; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_16_f16; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_f16; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_16_f16; + break; +#endif + + default: + return(ARM_MATH_ARGUMENT_ERROR); + break; + /* invalid sizes already filtered */ + } + + return(ARM_MATH_SUCCESS); + +} + +arm_status arm_cfft_init_f16( + arm_cfft_instance_f16 * S, + uint16_t fftLen) +{ + + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { + /* Initializations of structure parameters for 4096 point FFT */ +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_F16_4096)) + case 4096U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096; + S->pTwiddle = (float16_t *)twiddleCoefF16_4096; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F16_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048; + S->pTwiddle = (float16_t *)twiddleCoefF16_2048; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F16_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024; + S->pTwiddle = (float16_t *)twiddleCoefF16_1024; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_F16_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512; + S->pTwiddle = (float16_t *)twiddleCoefF16_512; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_F16_256)) + case 256U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256; + S->pTwiddle = (float16_t *)twiddleCoefF16_256; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_F16_128)) + case 128U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128; + S->pTwiddle = (float16_t *)twiddleCoefF16_128; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_F16_64)) + case 64U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64; + S->pTwiddle = (float16_t *)twiddleCoefF16_64; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_F16_32)) + case 32U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32; + S->pTwiddle = (float16_t *)twiddleCoefF16_32; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_F16_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16; + S->pTwiddle = (float16_t *)twiddleCoefF16_16; + status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} +#else + +#if defined(ARM_FLOAT16_SUPPORTED) + +arm_status arm_cfft_init_f16( + arm_cfft_instance_f16 * S, + uint16_t fftLen) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096)) + /* Initializations of structure parameters for 4096 point FFT */ + case 4096U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f16,4096); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f16,2048); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f16,1024); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f16,512); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256)) + case 256U: + FFTINIT(f16,256); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128)) + case 128U: + FFTINIT(f16,128); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64)) + case 64U: + FFTINIT(f16,64); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32)) + case 32U: + FFTINIT(f16,32); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + FFTINIT(f16,16); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c new file mode 100644 index 0000000..abfab4a --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c @@ -0,0 +1,358 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_init_f32.c + * Description: Initialization function for cfft f32 instance + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define FFTINIT(EXT,SIZE) \ + S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \ + S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \ + S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle; + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the cfft f32 function + @param[in,out] S points to an instance of the floating-point CFFT structure + @param[in] fftLen fft length (number of complex samples) + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + + @par Use of this function is mandatory only for the MVE version of the FFT. + Other versions can still initialize directly the data structure using + variables declared in arm_const_structs.h + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" +#include "arm_const_structs.h" + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_vec_fft.h" +#include "arm_mve_tables.h" + +arm_status arm_cfft_radix4by2_rearrange_twiddles_f32(arm_cfft_instance_f32 *S, int twidCoefModifier) +{ + + switch (S->fftLen >> (twidCoefModifier - 1)) { + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) + case 4096U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_f32; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_4096_f32; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_f32; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_4096_f32; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_f32; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_4096_f32; + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F32_1024) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048) + case 1024U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_f32; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_1024_f32; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_f32; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_1024_f32; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_f32; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_1024_f32; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F32_256) || defined(ARM_TABLE_TWIDDLECOEF_F32_512) + case 256U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_f32; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_256_f32; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_f32; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_256_f32; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_f32; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_256_f32; + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F32_64) || defined(ARM_TABLE_TWIDDLECOEF_F32_128) + case 64U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_f32; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_64_f32; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_f32; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_64_f32; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_f32; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_64_f32; + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \ + || defined(ARM_TABLE_TWIDDLECOEF_F32_16) || defined(ARM_TABLE_TWIDDLECOEF_F32_32) + case 16U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_f32; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_16_f32; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_f32; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_16_f32; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_f32; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_16_f32; + break; +#endif + + default: + return(ARM_MATH_ARGUMENT_ERROR); + break; + /* invalid sizes already filtered */ + } + + return(ARM_MATH_SUCCESS); + +} + +arm_status arm_cfft_init_f32( + arm_cfft_instance_f32 * S, + uint16_t fftLen) +{ + + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { + /* Initializations of structure parameters for 4096 point FFT */ +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_F32_4096)) + case 4096U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096; + S->pTwiddle = (float32_t *)twiddleCoef_4096; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F32_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048; + S->pTwiddle = (float32_t *)twiddleCoef_2048; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F32_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024; + S->pTwiddle = (float32_t *)twiddleCoef_1024; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_F32_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512; + S->pTwiddle = (float32_t *)twiddleCoef_512; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_F32_256)) + case 256U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256; + S->pTwiddle = (float32_t *)twiddleCoef_256; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_F32_128)) + case 128U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128; + S->pTwiddle = (float32_t *)twiddleCoef_128; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_F32_64)) + case 64U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64; + S->pTwiddle = (float32_t *)twiddleCoef_64; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_F32_32)) + case 32U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32; + S->pTwiddle = (float32_t *)twiddleCoef_32; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_F32_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16; + S->pTwiddle = (float32_t *)twiddleCoef_16; + status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} +#else +arm_status arm_cfft_init_f32( + arm_cfft_instance_f32 * S, + uint16_t fftLen) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096)) + /* Initializations of structure parameters for 4096 point FFT */ + case 4096U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f32,4096); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f32,2048); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f32,1024); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f32,512); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256)) + case 256U: + FFTINIT(f32,256); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128)) + case 128U: + FFTINIT(f32,128); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64)) + case 64U: + FFTINIT(f32,64); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32)) + case 32U: + FFTINIT(f32,32); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + FFTINIT(f32,16); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c new file mode 100644 index 0000000..61167fd --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c @@ -0,0 +1,150 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_init_f64.c + * Description: Initialization function for cfft f64 instance + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define FFTINIT(EXT,SIZE) \ + S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \ + S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \ + S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle; + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the cfft f64 function + @param[in,out] S points to an instance of the floating-point CFFT structure + @param[in] fftLen fft length (number of complex samples) + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + + @par Use of this function is mandatory only for the MVE version of the FFT. + Other versions can still initialize directly the data structure using + variables declared in arm_const_structs.h + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" +#include "arm_const_structs.h" + + +arm_status arm_cfft_init_f64( + arm_cfft_instance_f64 * S, + uint16_t fftLen) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096)) + /* Initializations of structure parameters for 4096 point FFT */ + case 4096U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f64,4096); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f64,2048); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f64,1024); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + FFTINIT(f64,512); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT_256)) + case 256U: + FFTINIT(f64,256); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT_128)) + case 128U: + FFTINIT(f64,128); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT_64)) + case 64U: + FFTINIT(f64,64); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT_32)) + case 32U: + FFTINIT(f64,32); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + FFTINIT(f64,16); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c new file mode 100644 index 0000000..3cf6e3a --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c @@ -0,0 +1,356 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_init_q15.c + * Description: Initialization function for cfft q15 instance + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define FFTINIT(EXT,SIZE) \ + S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \ + S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \ + S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle; + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the cfft q15 function + @param[in,out] S points to an instance of the floating-point CFFT structure + @param[in] fftLen fft length (number of complex samples) + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + + @par Use of this function is mandatory only for the MVE version of the FFT. + Other versions can still initialize directly the data structure using + variables declared in arm_const_structs.h + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" +#include "arm_const_structs.h" + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_vec_fft.h" +#include "arm_mve_tables.h" + + +arm_status arm_cfft_radix4by2_rearrange_twiddles_q15(arm_cfft_instance_q15 *S, int twidCoefModifier) +{ + + switch (S->fftLen >> (twidCoefModifier - 1)) { + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)) + case 4096U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_q15; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_4096_q15; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_q15; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_4096_q15; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_q15; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_4096_q15; + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024)) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048)) + case 1024U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_q15; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_1024_q15; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_q15; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_1024_q15; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_q15; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_1024_q15; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256)) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512)) + case 256U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_q15; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_256_q15; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_q15; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_256_q15; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_q15; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_256_q15; + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64)) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128)) + case 64U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_q15; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_64_q15; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_q15; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_64_q15; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_q15; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_64_q15; + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16)) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32)) + case 16U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_q15; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_16_q15; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_q15; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_16_q15; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_q15; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_16_q15; + break; +#endif + + default: + return(ARM_MATH_ARGUMENT_ERROR); + break; + /* invalid sizes already filtered */ + } + + return(ARM_MATH_SUCCESS); + +} + + + +arm_status arm_cfft_init_q15( + arm_cfft_instance_q15 * S, + uint16_t fftLen) +{ + + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { + /* Initializations of structure parameters for 4096 point FFT */ +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)) + case 4096U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096; + S->pTwiddle = (q15_t *)twiddleCoef_4096_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048; + S->pTwiddle = (q15_t *)twiddleCoef_2048_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024; + S->pTwiddle = (q15_t *)twiddleCoef_1024_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512; + S->pTwiddle = (q15_t *)twiddleCoef_512_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256)) + case 256U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256; + S->pTwiddle = (q15_t *)twiddleCoef_256_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128)) + case 128U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128; + S->pTwiddle = (q15_t *)twiddleCoef_128_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64)) + case 64U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64; + S->pTwiddle = (q15_t *)twiddleCoef_64_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32)) + case 32U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32; + S->pTwiddle = (q15_t *)twiddleCoef_32_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16; + S->pTwiddle = (q15_t *)twiddleCoef_16_q15; + status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} +#else +arm_status arm_cfft_init_q15( + arm_cfft_instance_q15 * S, + uint16_t fftLen) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096)) + /* Initializations of structure parameters for 4096 point FFT */ + case 4096U: + /* Initialise the bit reversal table modifier */ + FFTINIT(q15,4096); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + FFTINIT(q15,2048); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + FFTINIT(q15,1024); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + FFTINIT(q15,512); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256)) + case 256U: + FFTINIT(q15,256); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128)) + case 128U: + FFTINIT(q15,128); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64)) + case 64U: + FFTINIT(q15,64); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32)) + case 32U: + FFTINIT(q15,32); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + FFTINIT(q15,16); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c new file mode 100644 index 0000000..3722a68 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c @@ -0,0 +1,356 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_init_q31.c + * Description: Initialization function for cfft q31 instance + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define FFTINIT(EXT,SIZE) \ + S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \ + S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \ + S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle; + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the cfft q31 function + @param[in,out] S points to an instance of the floating-point CFFT structure + @param[in] fftLen fft length (number of complex samples) + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + + @par Use of this function is mandatory only for the MVE version of the FFT. + Other versions can still initialize directly the data structure using + variables declared in arm_const_structs.h + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" +#include "arm_const_structs.h" + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_vec_fft.h" +#include "arm_mve_tables.h" + + +arm_status arm_cfft_radix4by2_rearrange_twiddles_q31(arm_cfft_instance_q31 *S, int twidCoefModifier) +{ + + switch (S->fftLen >> (twidCoefModifier - 1)) { + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)) + case 4096U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_q31; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_4096_q31; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_q31; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_4096_q31; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_q31; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_4096_q31; + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024)) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048)) + case 1024U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_q31; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_1024_q31; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_q31; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_1024_q31; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_q31; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_1024_q31; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256)) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512)) + case 256U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_q31; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_256_q31; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_q31; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_256_q31; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_q31; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_256_q31; + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64)) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128)) + case 64U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_q31; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_64_q31; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_q31; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_64_q31; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_q31; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_64_q31; + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16)) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32)) + case 16U: + S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_q31; + S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_16_q31; + + S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_q31; + S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_16_q31; + + S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_q31; + S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_16_q31; + break; +#endif + + default: + return(ARM_MATH_ARGUMENT_ERROR); + break; + /* invalid sizes already filtered */ + } + + return(ARM_MATH_SUCCESS); + +} + + + +arm_status arm_cfft_init_q31( + arm_cfft_instance_q31 * S, + uint16_t fftLen) +{ + + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { + /* Initializations of structure parameters for 4096 point FFT */ +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)) + case 4096U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096; + S->pTwiddle = (q31_t *)twiddleCoef_4096_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048; + S->pTwiddle = (q31_t *)twiddleCoef_2048_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024; + S->pTwiddle = (q31_t *)twiddleCoef_1024_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512; + S->pTwiddle = (q31_t *)twiddleCoef_512_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256)) + case 256U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256; + S->pTwiddle = (q31_t *)twiddleCoef_256_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128)) + case 128U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128; + S->pTwiddle = (q31_t *)twiddleCoef_128_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64)) + case 64U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64; + S->pTwiddle = (q31_t *)twiddleCoef_64_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32)) + case 32U: + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32; + S->pTwiddle = (q31_t *)twiddleCoef_32_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 2); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH; + S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16; + S->pTwiddle = (q31_t *)twiddleCoef_16_q31; + status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} +#else +arm_status arm_cfft_init_q31( + arm_cfft_instance_q31 * S, + uint16_t fftLen) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = NULL; + + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096)) + /* Initializations of structure parameters for 4096 point FFT */ + case 4096U: + /* Initialise the bit reversal table modifier */ + FFTINIT(q31,4096); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048)) + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the bit reversal table modifier */ + FFTINIT(q31,2048); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024)) + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the bit reversal table modifier */ + FFTINIT(q31,1024); + + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512)) + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the bit reversal table modifier */ + FFTINIT(q31,512); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256)) + case 256U: + FFTINIT(q31,256); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128)) + case 128U: + FFTINIT(q31,128); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64)) + case 64U: + FFTINIT(q31,64); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32)) + case 32U: + FFTINIT(q31,32); + break; +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16)) + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + FFTINIT(q31,16); + break; +#endif + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + + return (status); +} +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c new file mode 100644 index 0000000..74a6e7a --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c @@ -0,0 +1,893 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_q15.c + * Description: Combined Radix Decimation in Q15 Frequency CFFT processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_vec_fft.h" + + +static void _arm_radix4_butterfly_q15_mve( + const arm_cfft_instance_q15 * S, + q15_t *pSrc, + uint32_t fftLen) +{ + q15x8_t vecTmp0, vecTmp1; + q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1; + q15x8_t vecA, vecB, vecC, vecD; + uint32_t blkCnt; + uint32_t n1, n2; + uint32_t stage = 0; + int32_t iter = 1; + static const int32_t strides[4] = { + (0 - 16) * (int32_t)sizeof(q15_t *), (4 - 16) * (int32_t)sizeof(q15_t *), + (8 - 16) * (int32_t)sizeof(q15_t *), (12 - 16) * (int32_t)sizeof(q15_t *) + }; + + /* + * Process first stages + * Each stage in middle stages provides two down scaling of the input + */ + n2 = fftLen; + n1 = n2; + n2 >>= 2u; + + for (int k = fftLen / 4u; k > 1; k >>= 2u) + { + q15_t const *p_rearranged_twiddle_tab_stride2 = + &S->rearranged_twiddle_stride2[ + S->rearranged_twiddle_tab_stride2_arr[stage]]; + q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[ + S->rearranged_twiddle_tab_stride3_arr[stage]]; + q15_t const *p_rearranged_twiddle_tab_stride1 = + &S->rearranged_twiddle_stride1[ + S->rearranged_twiddle_tab_stride1_arr[stage]]; + + q15_t * pBase = pSrc; + for (int i = 0; i < iter; i++) + { + q15_t *inA = pBase; + q15_t *inB = inA + n2 * CMPLX_DIM; + q15_t *inC = inB + n2 * CMPLX_DIM; + q15_t *inD = inC + n2 * CMPLX_DIM; + q15_t const *pW1 = p_rearranged_twiddle_tab_stride1; + q15_t const *pW2 = p_rearranged_twiddle_tab_stride2; + q15_t const *pW3 = p_rearranged_twiddle_tab_stride3; + q15x8_t vecW; + + blkCnt = n2 / 4; + /* + * load 4 x q15 complex pair + */ + vecA = vldrhq_s16(inA); + vecC = vldrhq_s16(inC); + while (blkCnt > 0U) + { + vecB = vldrhq_s16(inB); + vecD = vldrhq_s16(inD); + + vecSum0 = vhaddq(vecA, vecC); + vecDiff0 = vhsubq(vecA, vecC); + + vecSum1 = vhaddq(vecB, vecD); + vecDiff1 = vhsubq(vecB, vecD); + /* + * [ 1 1 1 1 ] * [ A B C D ]' .* 1 + */ + vecTmp0 = vhaddq(vecSum0, vecSum1); + vst1q(inA, vecTmp0); + inA += 8; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]' + */ + vecTmp0 = vhsubq(vecSum0, vecSum1); + /* + * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2 + */ + vecW = vld1q(pW2); + pW2 += 8; + vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t); + + vst1q(inB, vecTmp1); + inB += 8; + /* + * [ 1 -i -1 +i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 -i -1 +i ] * [ A B C D ]'.* W1 + */ + vecW = vld1q(pW1); + pW1 += 8; + vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t); + vst1q(inC, vecTmp1); + inC += 8; + + /* + * [ 1 +i -1 -i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 +i -1 -i ] * [ A B C D ]'.* W3 + */ + vecW = vld1q(pW3); + pW3 += 8; + vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t); + vst1q(inD, vecTmp1); + inD += 8; + + vecA = vldrhq_s16(inA); + vecC = vldrhq_s16(inC); + + blkCnt--; + } + pBase += CMPLX_DIM * n1; + } + n1 = n2; + n2 >>= 2u; + iter = iter << 2; + stage++; + } + + /* + * start of Last stage process + */ + uint32x4_t vecScGathAddr = vld1q_u32 ((uint32_t*)strides); + vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; + + /* + * load scheduling + */ + vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64); + vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8); + + blkCnt = (fftLen >> 4); + while (blkCnt > 0U) + { + vecSum0 = vhaddq(vecA, vecC); + vecDiff0 = vhsubq(vecA, vecC); + + vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4); + vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12); + + vecSum1 = vhaddq(vecB, vecD); + vecDiff1 = vhsubq(vecB, vecD); + /* + * pre-load for next iteration + */ + vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64); + vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8); + + vecTmp0 = vhaddq(vecSum0, vecSum1); + vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0); + + vecTmp0 = vhsubq(vecSum0, vecSum1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0); + + vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0); + + vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0); + + blkCnt--; + } + +} + +static void arm_cfft_radix4by2_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen) +{ + uint32_t n2; + q15_t *pIn0; + q15_t *pIn1; + const q15_t *pCoef = S->pTwiddle; + uint32_t blkCnt; + q15x8_t vecIn0, vecIn1, vecSum, vecDiff; + q15x8_t vecCmplxTmp, vecTw; + q15_t const *pCoefVec; + + n2 = fftLen >> 1; + + pIn0 = pSrc; + pIn1 = pSrc + fftLen; + pCoefVec = pCoef; + + blkCnt = n2 / 4; + + while (blkCnt > 0U) + { + vecIn0 = *(q15x8_t *) pIn0; + vecIn1 = *(q15x8_t *) pIn1; + + vecIn0 = vecIn0 >> 1; + vecIn1 = vecIn1 >> 1; + vecSum = vhaddq(vecIn0, vecIn1); + vst1q(pIn0, vecSum); + pIn0 += 8; + + vecTw = vld1q(pCoefVec); + pCoefVec += 8; + + vecDiff = vhsubq(vecIn0, vecIn1); + vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw, q15x8_t); + vst1q(pIn1, vecCmplxTmp); + pIn1 += 8; + + blkCnt--; + } + + _arm_radix4_butterfly_q15_mve(S, pSrc, n2); + + _arm_radix4_butterfly_q15_mve(S, pSrc + fftLen, n2); + + + pIn0 = pSrc; + blkCnt = (fftLen << 1) >> 3; + while (blkCnt > 0U) + { + vecIn0 = *(q15x8_t *) pIn0; + vecIn0 = vecIn0 << 1; + vst1q(pIn0, vecIn0); + pIn0 += 8; + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (fftLen << 1) & 7; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp16q(blkCnt); + + vecIn0 = *(q15x8_t *) pIn0; + vecIn0 = vecIn0 << 1; + vstrhq_p(pIn0, vecIn0, p0); + } +} + +static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S,q15_t *pSrc, uint32_t fftLen) +{ + q15x8_t vecTmp0, vecTmp1; + q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1; + q15x8_t vecA, vecB, vecC, vecD; + uint32_t blkCnt; + uint32_t n1, n2; + uint32_t stage = 0; + int32_t iter = 1; + static const int32_t strides[4] = { + (0 - 16) * (int32_t)sizeof(q15_t *), (4 - 16) * (int32_t)sizeof(q15_t *), + (8 - 16) * (int32_t)sizeof(q15_t *), (12 - 16) * (int32_t)sizeof(q15_t *) + }; + + + /* + * Process first stages + * Each stage in middle stages provides two down scaling of the input + */ + n2 = fftLen; + n1 = n2; + n2 >>= 2u; + + for (int k = fftLen / 4u; k > 1; k >>= 2u) + { + q15_t const *p_rearranged_twiddle_tab_stride2 = + &S->rearranged_twiddle_stride2[ + S->rearranged_twiddle_tab_stride2_arr[stage]]; + q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[ + S->rearranged_twiddle_tab_stride3_arr[stage]]; + q15_t const *p_rearranged_twiddle_tab_stride1 = + &S->rearranged_twiddle_stride1[ + S->rearranged_twiddle_tab_stride1_arr[stage]]; + + q15_t * pBase = pSrc; + for (int i = 0; i < iter; i++) + { + q15_t *inA = pBase; + q15_t *inB = inA + n2 * CMPLX_DIM; + q15_t *inC = inB + n2 * CMPLX_DIM; + q15_t *inD = inC + n2 * CMPLX_DIM; + q15_t const *pW1 = p_rearranged_twiddle_tab_stride1; + q15_t const *pW2 = p_rearranged_twiddle_tab_stride2; + q15_t const *pW3 = p_rearranged_twiddle_tab_stride3; + q15x8_t vecW; + + + blkCnt = n2 / 4; + /* + * load 4 x q15 complex pair + */ + vecA = vldrhq_s16(inA); + vecC = vldrhq_s16(inC); + while (blkCnt > 0U) + { + vecB = vldrhq_s16(inB); + vecD = vldrhq_s16(inD); + + vecSum0 = vhaddq(vecA, vecC); + vecDiff0 = vhsubq(vecA, vecC); + + vecSum1 = vhaddq(vecB, vecD); + vecDiff1 = vhsubq(vecB, vecD); + /* + * [ 1 1 1 1 ] * [ A B C D ]' .* 1 + */ + vecTmp0 = vhaddq(vecSum0, vecSum1); + vst1q(inA, vecTmp0); + inA += 8; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]' + */ + vecTmp0 = vhsubq(vecSum0, vecSum1); + /* + * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2 + */ + vecW = vld1q(pW2); + pW2 += 8; + vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t); + + vst1q(inB, vecTmp1); + inB += 8; + /* + * [ 1 -i -1 +i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 -i -1 +i ] * [ A B C D ]'.* W1 + */ + vecW = vld1q(pW1); + pW1 += 8; + vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t); + vst1q(inC, vecTmp1); + inC += 8; + /* + * [ 1 +i -1 -i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 +i -1 -i ] * [ A B C D ]'.* W3 + */ + vecW = vld1q(pW3); + pW3 += 8; + vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t); + vst1q(inD, vecTmp1); + inD += 8; + + vecA = vldrhq_s16(inA); + vecC = vldrhq_s16(inC); + + blkCnt--; + } + pBase += CMPLX_DIM * n1; + } + n1 = n2; + n2 >>= 2u; + iter = iter << 2; + stage++; + } + + /* + * start of Last stage process + */ + uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides); + vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; + + /* + * load scheduling + */ + vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64); + vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8); + + blkCnt = (fftLen >> 4); + while (blkCnt > 0U) + { + vecSum0 = vhaddq(vecA, vecC); + vecDiff0 = vhsubq(vecA, vecC); + + vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4); + vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12); + + vecSum1 = vhaddq(vecB, vecD); + vecDiff1 = vhsubq(vecB, vecD); + /* + * pre-load for next iteration + */ + vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64); + vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8); + + vecTmp0 = vhaddq(vecSum0, vecSum1); + vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0); + + vecTmp0 = vhsubq(vecSum0, vecSum1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0); + + vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0); + + vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0); + + blkCnt--; + } +} + +static void arm_cfft_radix4by2_inverse_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen) +{ + uint32_t n2; + q15_t *pIn0; + q15_t *pIn1; + const q15_t *pCoef = S->pTwiddle; + + uint32_t blkCnt; + q15x8_t vecIn0, vecIn1, vecSum, vecDiff; + q15x8_t vecCmplxTmp, vecTw; + q15_t const *pCoefVec; + + n2 = fftLen >> 1; + + pIn0 = pSrc; + pIn1 = pSrc + fftLen; + pCoefVec = pCoef; + + blkCnt = n2 / 4; + + while (blkCnt > 0U) + { + vecIn0 = *(q15x8_t *) pIn0; + vecIn1 = *(q15x8_t *) pIn1; + + vecIn0 = vecIn0 >> 1; + vecIn1 = vecIn1 >> 1; + vecSum = vhaddq(vecIn0, vecIn1); + vst1q(pIn0, vecSum); + pIn0 += 8; + + vecTw = vld1q(pCoefVec); + pCoefVec += 8; + + vecDiff = vhsubq(vecIn0, vecIn1); + vecCmplxTmp = vqrdmlsdhq(vuninitializedq_s16() , vecDiff, vecTw); + vecCmplxTmp = vqrdmladhxq(vecCmplxTmp, vecDiff, vecTw); + vst1q(pIn1, vecCmplxTmp); + pIn1 += 8; + + blkCnt--; + } + + + _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, n2); + + _arm_radix4_butterfly_inverse_q15_mve(S, pSrc + fftLen, n2); + + pIn0 = pSrc; + blkCnt = (fftLen << 1) >> 3; + while (blkCnt > 0U) + { + vecIn0 = *(q15x8_t *) pIn0; + vecIn0 = vecIn0 << 1; + vst1q(pIn0, vecIn0); + pIn0 += 8; + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (fftLen << 1) & 7; + while (blkCnt > 0U) + { + mve_pred16_t p0 = vctp16q(blkCnt); + + vecIn0 = *(q15x8_t *) pIn0; + vecIn0 = vecIn0 << 1; + vstrhq_p(pIn0, vecIn0, p0); + } +} + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for Q15 complex FFT. + @param[in] S points to an instance of Q15 CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ +void arm_cfft_q15( + const arm_cfft_instance_q15 * S, + q15_t * pSrc, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t fftLen = S->fftLen; + + if (ifftFlag == 1U) { + + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen); + break; + } + } else { + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen); + break; + } + } + + + if (bitReverseFlag) + { + + arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); + + } +} + +#else + +extern void arm_radix4_butterfly_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef, + uint32_t twidCoefModifier); + +extern void arm_radix4_butterfly_inverse_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef, + uint32_t twidCoefModifier); + +extern void arm_bitreversal_16( + uint16_t * pSrc, + const uint16_t bitRevLen, + const uint16_t * pBitRevTable); + +void arm_cfft_radix4by2_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef); + +void arm_cfft_radix4by2_inverse_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef); + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for Q15 complex FFT. + @param[in] S points to an instance of Q15 CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ + +void arm_cfft_q15( + const arm_cfft_instance_q15 * S, + q15_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t L = S->fftLen; + + if (ifftFlag == 1U) + { + switch (L) + { + case 16: + case 64: + case 256: + case 1024: + case 4096: + arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 ); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle ); + break; + } + } + else + { + switch (L) + { + case 16: + case 64: + case 256: + case 1024: + case 4096: + arm_radix4_butterfly_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 ); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_q15 ( p1, L, S->pTwiddle ); + break; + } + } + + if ( bitReverseFlag ) + arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable); +} + +/** + @} end of ComplexFFT group + */ + +void arm_cfft_radix4by2_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef) +{ + uint32_t i; + uint32_t n2; + q15_t p0, p1, p2, p3; +#if defined (ARM_MATH_DSP) + q31_t T, S, R; + q31_t coeff, out1, out2; + const q15_t *pC = pCoef; + q15_t *pSi = pSrc; + q15_t *pSl = pSrc + fftLen; +#else + uint32_t l; + q15_t xt, yt, cosVal, sinVal; +#endif + + n2 = fftLen >> 1U; + +#if defined (ARM_MATH_DSP) + + for (i = n2; i > 0; i--) + { + coeff = read_q15x2_ia (&pC); + + T = read_q15x2 (pSi); + T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */ + + S = read_q15x2 (pSl); + S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */ + + R = __QSUB16(T, S); + + write_q15x2_ia (&pSi, __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUAD(coeff, R) >> 16U; + out2 = __SMUSDX(coeff, R); +#else + out1 = __SMUSDX(R, coeff) >> 16U; + out2 = __SMUAD(coeff, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ) ); + } + +#else /* #if defined (ARM_MATH_DSP) */ + + for (i = 0; i < n2; i++) + { + cosVal = pCoef[2 * i]; + sinVal = pCoef[2 * i + 1]; + + l = i + n2; + + xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U); + pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U; + + yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U); + pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U; + + pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) + + ((int16_t) (((q31_t) yt * sinVal) >> 16U)) ); + + pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) - + ((int16_t) (((q31_t) xt * sinVal) >> 16U)) ); + } + +#endif /* #if defined (ARM_MATH_DSP) */ + + /* first col */ + arm_radix4_butterfly_q15( pSrc, n2, (q15_t*)pCoef, 2U); + + /* second col */ + arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U); + + n2 = fftLen >> 1U; + for (i = 0; i < n2; i++) + { + p0 = pSrc[4 * i + 0]; + p1 = pSrc[4 * i + 1]; + p2 = pSrc[4 * i + 2]; + p3 = pSrc[4 * i + 3]; + + p0 <<= 1U; + p1 <<= 1U; + p2 <<= 1U; + p3 <<= 1U; + + pSrc[4 * i + 0] = p0; + pSrc[4 * i + 1] = p1; + pSrc[4 * i + 2] = p2; + pSrc[4 * i + 3] = p3; + } + +} + +void arm_cfft_radix4by2_inverse_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef) +{ + uint32_t i; + uint32_t n2; + q15_t p0, p1, p2, p3; +#if defined (ARM_MATH_DSP) + q31_t T, S, R; + q31_t coeff, out1, out2; + const q15_t *pC = pCoef; + q15_t *pSi = pSrc; + q15_t *pSl = pSrc + fftLen; +#else + uint32_t l; + q15_t xt, yt, cosVal, sinVal; +#endif + + n2 = fftLen >> 1U; + +#if defined (ARM_MATH_DSP) + + for (i = n2; i > 0; i--) + { + coeff = read_q15x2_ia (&pC); + + T = read_q15x2 (pSi); + T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */ + + S = read_q15x2 (pSl); + S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */ + + R = __QSUB16(T, S); + + write_q15x2_ia (&pSi, __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUSD(coeff, R) >> 16U; + out2 = __SMUADX(coeff, R); +#else + out1 = __SMUADX(R, coeff) >> 16U; + out2 = __SMUSD(__QSUB(0, coeff), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 )); + } + +#else /* #if defined (ARM_MATH_DSP) */ + + for (i = 0; i < n2; i++) + { + cosVal = pCoef[2 * i]; + sinVal = pCoef[2 * i + 1]; + + l = i + n2; + + xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U); + pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U; + + yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U); + pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U; + + pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) - + ((int16_t) (((q31_t) yt * sinVal) >> 16U)) ); + + pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) + + ((int16_t) (((q31_t) xt * sinVal) >> 16U)) ); + } + +#endif /* #if defined (ARM_MATH_DSP) */ + + /* first col */ + arm_radix4_butterfly_inverse_q15( pSrc, n2, (q15_t*)pCoef, 2U); + + /* second col */ + arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U); + + n2 = fftLen >> 1U; + for (i = 0; i < n2; i++) + { + p0 = pSrc[4 * i + 0]; + p1 = pSrc[4 * i + 1]; + p2 = pSrc[4 * i + 2]; + p3 = pSrc[4 * i + 3]; + + p0 <<= 1U; + p1 <<= 1U; + p2 <<= 1U; + p3 <<= 1U; + + pSrc[4 * i + 0] = p0; + pSrc[4 * i + 1] = p1; + pSrc[4 * i + 2] = p2; + pSrc[4 * i + 3] = p3; + } +} + +#endif /* defined(ARM_MATH_MVEI) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c new file mode 100644 index 0000000..78ce505 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c @@ -0,0 +1,847 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_q31.c + * Description: Combined Radix Decimation in Frequency CFFT fixed point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + + + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_vec_fft.h" + + +static void _arm_radix4_butterfly_q31_mve( + const arm_cfft_instance_q31 * S, + q31_t *pSrc, + uint32_t fftLen) +{ + q31x4_t vecTmp0, vecTmp1; + q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1; + q31x4_t vecA, vecB, vecC, vecD; + uint32_t blkCnt; + uint32_t n1, n2; + uint32_t stage = 0; + int32_t iter = 1; + static const int32_t strides[4] = { + (0 - 16) * (int32_t)sizeof(q31_t *), (1 - 16) * (int32_t)sizeof(q31_t *), + (8 - 16) * (int32_t)sizeof(q31_t *), (9 - 16) * (int32_t)sizeof(q31_t *) + }; + + + /* + * Process first stages + * Each stage in middle stages provides two down scaling of the input + */ + n2 = fftLen; + n1 = n2; + n2 >>= 2u; + + for (int k = fftLen / 4u; k > 1; k >>= 2u) + { + q31_t const *p_rearranged_twiddle_tab_stride2 = + &S->rearranged_twiddle_stride2[ + S->rearranged_twiddle_tab_stride2_arr[stage]]; + q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[ + S->rearranged_twiddle_tab_stride3_arr[stage]]; + q31_t const *p_rearranged_twiddle_tab_stride1 = + &S->rearranged_twiddle_stride1[ + S->rearranged_twiddle_tab_stride1_arr[stage]]; + + q31_t * pBase = pSrc; + for (int i = 0; i < iter; i++) + { + q31_t *inA = pBase; + q31_t *inB = inA + n2 * CMPLX_DIM; + q31_t *inC = inB + n2 * CMPLX_DIM; + q31_t *inD = inC + n2 * CMPLX_DIM; + q31_t const *pW1 = p_rearranged_twiddle_tab_stride1; + q31_t const *pW2 = p_rearranged_twiddle_tab_stride2; + q31_t const *pW3 = p_rearranged_twiddle_tab_stride3; + q31x4_t vecW; + + + blkCnt = n2 / 2; + /* + * load 2 x q31 complex pair + */ + vecA = vldrwq_s32(inA); + vecC = vldrwq_s32(inC); + while (blkCnt > 0U) + { + vecB = vldrwq_s32(inB); + vecD = vldrwq_s32(inD); + + vecSum0 = vhaddq(vecA, vecC); + vecDiff0 = vhsubq(vecA, vecC); + + vecSum1 = vhaddq(vecB, vecD); + vecDiff1 = vhsubq(vecB, vecD); + /* + * [ 1 1 1 1 ] * [ A B C D ]' .* 1 + */ + vecTmp0 = vhaddq(vecSum0, vecSum1); + vst1q(inA, vecTmp0); + inA += 4; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]' + */ + vecTmp0 = vhsubq(vecSum0, vecSum1); + /* + * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2 + */ + vecW = vld1q(pW2); + pW2 += 4; + vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t); + + vst1q(inB, vecTmp1); + inB += 4; + /* + * [ 1 -i -1 +i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 -i -1 +i ] * [ A B C D ]'.* W1 + */ + vecW = vld1q(pW1); + pW1 += 4; + vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t); + vst1q(inC, vecTmp1); + inC += 4; + /* + * [ 1 +i -1 -i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 +i -1 -i ] * [ A B C D ]'.* W3 + */ + vecW = vld1q(pW3); + pW3 += 4; + vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t); + vst1q(inD, vecTmp1); + inD += 4; + + vecA = vldrwq_s32(inA); + vecC = vldrwq_s32(inC); + + blkCnt--; + } + pBase += CMPLX_DIM * n1; + } + n1 = n2; + n2 >>= 2u; + iter = iter << 2; + stage++; + } + + /* + * End of 1st stages process + * data is in 11.21(q21) format for the 1024 point as there are 3 middle stages + * data is in 9.23(q23) format for the 256 point as there are 2 middle stages + * data is in 7.25(q25) format for the 64 point as there are 1 middle stage + * data is in 5.27(q27) format for the 16 point as there are no middle stages + */ + + /* + * start of Last stage process + */ + uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides); + vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; + + /* + * load scheduling + */ + vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64); + vecC = vldrwq_gather_base_s32(vecScGathAddr, 16); + + blkCnt = (fftLen >> 3); + while (blkCnt > 0U) + { + vecSum0 = vhaddq(vecA, vecC); + vecDiff0 = vhsubq(vecA, vecC); + + vecB = vldrwq_gather_base_s32(vecScGathAddr, 8); + vecD = vldrwq_gather_base_s32(vecScGathAddr, 24); + + vecSum1 = vhaddq(vecB, vecD); + vecDiff1 = vhsubq(vecB, vecD); + /* + * pre-load for next iteration + */ + vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64); + vecC = vldrwq_gather_base_s32(vecScGathAddr, 16); + + vecTmp0 = vhaddq(vecSum0, vecSum1); + vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0); + + vecTmp0 = vhsubq(vecSum0, vecSum1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0); + + vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0); + + vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0); + + blkCnt--; + } + + /* + * output is in 11.21(q21) format for the 1024 point + * output is in 9.23(q23) format for the 256 point + * output is in 7.25(q25) format for the 64 point + * output is in 5.27(q27) format for the 16 point + */ +} + + +static void arm_cfft_radix4by2_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen) +{ + uint32_t n2; + q31_t *pIn0; + q31_t *pIn1; + const q31_t *pCoef = S->pTwiddle; + uint32_t blkCnt; + q31x4_t vecIn0, vecIn1, vecSum, vecDiff; + q31x4_t vecCmplxTmp, vecTw; + + n2 = fftLen >> 1; + pIn0 = pSrc; + pIn1 = pSrc + fftLen; + + blkCnt = n2 / 2; + + while (blkCnt > 0U) + { + vecIn0 = vld1q_s32(pIn0); + vecIn1 = vld1q_s32(pIn1); + + vecIn0 = vecIn0 >> 1; + vecIn1 = vecIn1 >> 1; + vecSum = vhaddq(vecIn0, vecIn1); + vst1q(pIn0, vecSum); + pIn0 += 4; + + vecTw = vld1q_s32(pCoef); + pCoef += 4; + vecDiff = vhsubq(vecIn0, vecIn1); + + vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw, q31x4_t); + vst1q(pIn1, vecCmplxTmp); + pIn1 += 4; + + blkCnt--; + } + + _arm_radix4_butterfly_q31_mve(S, pSrc, n2); + + _arm_radix4_butterfly_q31_mve(S, pSrc + fftLen, n2); + + pIn0 = pSrc; + blkCnt = (fftLen << 1) >> 2; + while (blkCnt > 0U) + { + vecIn0 = vld1q_s32(pIn0); + vecIn0 = vecIn0 << 1; + vst1q(pIn0, vecIn0); + pIn0 += 4; + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (fftLen << 1) & 3; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp32q(blkCnt); + + vecIn0 = vld1q_s32(pIn0); + vecIn0 = vecIn0 << 1; + vstrwq_p(pIn0, vecIn0, p0); + } + +} + +static void _arm_radix4_butterfly_inverse_q31_mve( + const arm_cfft_instance_q31 *S, + q31_t *pSrc, + uint32_t fftLen) +{ + q31x4_t vecTmp0, vecTmp1; + q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1; + q31x4_t vecA, vecB, vecC, vecD; + uint32_t blkCnt; + uint32_t n1, n2; + uint32_t stage = 0; + int32_t iter = 1; + static const int32_t strides[4] = { + (0 - 16) * (int32_t)sizeof(q31_t *), (1 - 16) * (int32_t)sizeof(q31_t *), + (8 - 16) * (int32_t)sizeof(q31_t *), (9 - 16) * (int32_t)sizeof(q31_t *) + }; + + /* + * Process first stages + * Each stage in middle stages provides two down scaling of the input + */ + n2 = fftLen; + n1 = n2; + n2 >>= 2u; + + for (int k = fftLen / 4u; k > 1; k >>= 2u) + { + q31_t const *p_rearranged_twiddle_tab_stride2 = + &S->rearranged_twiddle_stride2[ + S->rearranged_twiddle_tab_stride2_arr[stage]]; + q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[ + S->rearranged_twiddle_tab_stride3_arr[stage]]; + q31_t const *p_rearranged_twiddle_tab_stride1 = + &S->rearranged_twiddle_stride1[ + S->rearranged_twiddle_tab_stride1_arr[stage]]; + + q31_t * pBase = pSrc; + for (int i = 0; i < iter; i++) + { + q31_t *inA = pBase; + q31_t *inB = inA + n2 * CMPLX_DIM; + q31_t *inC = inB + n2 * CMPLX_DIM; + q31_t *inD = inC + n2 * CMPLX_DIM; + q31_t const *pW1 = p_rearranged_twiddle_tab_stride1; + q31_t const *pW2 = p_rearranged_twiddle_tab_stride2; + q31_t const *pW3 = p_rearranged_twiddle_tab_stride3; + q31x4_t vecW; + + blkCnt = n2 / 2; + /* + * load 2 x q31 complex pair + */ + vecA = vldrwq_s32(inA); + vecC = vldrwq_s32(inC); + while (blkCnt > 0U) + { + vecB = vldrwq_s32(inB); + vecD = vldrwq_s32(inD); + + vecSum0 = vhaddq(vecA, vecC); + vecDiff0 = vhsubq(vecA, vecC); + + vecSum1 = vhaddq(vecB, vecD); + vecDiff1 = vhsubq(vecB, vecD); + /* + * [ 1 1 1 1 ] * [ A B C D ]' .* 1 + */ + vecTmp0 = vhaddq(vecSum0, vecSum1); + vst1q(inA, vecTmp0); + inA += 4; + /* + * [ 1 -1 1 -1 ] * [ A B C D ]' + */ + vecTmp0 = vhsubq(vecSum0, vecSum1); + /* + * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2 + */ + vecW = vld1q(pW2); + pW2 += 4; + vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t); + + vst1q(inB, vecTmp1); + inB += 4; + /* + * [ 1 -i -1 +i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 -i -1 +i ] * [ A B C D ]'.* W1 + */ + vecW = vld1q(pW1); + pW1 += 4; + vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t); + vst1q(inC, vecTmp1); + inC += 4; + /* + * [ 1 +i -1 -i ] * [ A B C D ]' + */ + vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1); + /* + * [ 1 +i -1 -i ] * [ A B C D ]'.* W3 + */ + vecW = vld1q(pW3); + pW3 += 4; + vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t); + vst1q(inD, vecTmp1); + inD += 4; + + vecA = vldrwq_s32(inA); + vecC = vldrwq_s32(inC); + + blkCnt--; + } + pBase += CMPLX_DIM * n1; + } + n1 = n2; + n2 >>= 2u; + iter = iter << 2; + stage++; + } + + /* + * End of 1st stages process + * data is in 11.21(q21) format for the 1024 point as there are 3 middle stages + * data is in 9.23(q23) format for the 256 point as there are 2 middle stages + * data is in 7.25(q25) format for the 64 point as there are 1 middle stage + * data is in 5.27(q27) format for the 16 point as there are no middle stages + */ + + /* + * start of Last stage process + */ + uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides); + vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; + + /* + * load scheduling + */ + vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64); + vecC = vldrwq_gather_base_s32(vecScGathAddr, 16); + + blkCnt = (fftLen >> 3); + while (blkCnt > 0U) + { + vecSum0 = vhaddq(vecA, vecC); + vecDiff0 = vhsubq(vecA, vecC); + + vecB = vldrwq_gather_base_s32(vecScGathAddr, 8); + vecD = vldrwq_gather_base_s32(vecScGathAddr, 24); + + vecSum1 = vhaddq(vecB, vecD); + vecDiff1 = vhsubq(vecB, vecD); + /* + * pre-load for next iteration + */ + vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64); + vecC = vldrwq_gather_base_s32(vecScGathAddr, 16); + + vecTmp0 = vhaddq(vecSum0, vecSum1); + vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0); + + vecTmp0 = vhsubq(vecSum0, vecSum1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0); + + vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0); + + vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1); + vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0); + + blkCnt--; + } + /* + * output is in 11.21(q21) format for the 1024 point + * output is in 9.23(q23) format for the 256 point + * output is in 7.25(q25) format for the 64 point + * output is in 5.27(q27) format for the 16 point + */ +} + +static void arm_cfft_radix4by2_inverse_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen) +{ + uint32_t n2; + q31_t *pIn0; + q31_t *pIn1; + const q31_t *pCoef = S->pTwiddle; + + //uint16_t twidCoefModifier = arm_cfft_radix2_twiddle_factor(S->fftLen); + //q31_t twidIncr = (2 * twidCoefModifier * sizeof(q31_t)); + uint32_t blkCnt; + //uint64x2_t vecOffs; + q31x4_t vecIn0, vecIn1, vecSum, vecDiff; + q31x4_t vecCmplxTmp, vecTw; + + n2 = fftLen >> 1; + + pIn0 = pSrc; + pIn1 = pSrc + fftLen; + //vecOffs[0] = 0; + //vecOffs[1] = (uint64_t) twidIncr; + blkCnt = n2 / 2; + + while (blkCnt > 0U) + { + vecIn0 = vld1q_s32(pIn0); + vecIn1 = vld1q_s32(pIn1); + + vecIn0 = vecIn0 >> 1; + vecIn1 = vecIn1 >> 1; + vecSum = vhaddq(vecIn0, vecIn1); + vst1q(pIn0, vecSum); + pIn0 += 4; + + //vecTw = (q31x4_t) vldrdq_gather_offset_s64(pCoef, vecOffs); + vecTw = vld1q_s32(pCoef); + pCoef += 4; + vecDiff = vhsubq(vecIn0, vecIn1); + + vecCmplxTmp = MVE_CMPLX_MULT_FX_AxB(vecDiff, vecTw, q31x4_t); + vst1q(pIn1, vecCmplxTmp); + pIn1 += 4; + + //vecOffs = vaddq((q31x4_t) vecOffs, 2 * twidIncr); + blkCnt--; + } + + _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, n2); + + _arm_radix4_butterfly_inverse_q31_mve(S, pSrc + fftLen, n2); + + pIn0 = pSrc; + blkCnt = (fftLen << 1) >> 2; + while (blkCnt > 0U) + { + vecIn0 = vld1q_s32(pIn0); + vecIn0 = vecIn0 << 1; + vst1q(pIn0, vecIn0); + pIn0 += 4; + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (fftLen << 1) & 3; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp32q(blkCnt); + + vecIn0 = vld1q_s32(pIn0); + vecIn0 = vecIn0 << 1; + vstrwq_p(pIn0, vecIn0, p0); + } + +} + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the Q31 complex FFT. + @param[in] S points to an instance of the fixed-point CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ +void arm_cfft_q31( + const arm_cfft_instance_q31 * S, + q31_t * pSrc, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t fftLen = S->fftLen; + + if (ifftFlag == 1U) { + + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen); + break; + } + } else { + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen); + break; + } + } + + + if (bitReverseFlag) + { + + arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); + + } +} +#else + +extern void arm_radix4_butterfly_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint32_t twidCoefModifier); + +extern void arm_radix4_butterfly_inverse_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint32_t twidCoefModifier); + +extern void arm_bitreversal_32( + uint32_t * pSrc, + const uint16_t bitRevLen, + const uint16_t * pBitRevTable); + +void arm_cfft_radix4by2_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef); + +void arm_cfft_radix4by2_inverse_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef); + + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the Q31 complex FFT. + @param[in] S points to an instance of the fixed-point CFFT structure + @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return none + */ +void arm_cfft_q31( + const arm_cfft_instance_q31 * S, + q31_t * p1, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + uint32_t L = S->fftLen; + + if (ifftFlag == 1U) + { + switch (L) + { + case 16: + case 64: + case 256: + case 1024: + case 4096: + arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 ); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle ); + break; + } + } + else + { + switch (L) + { + case 16: + case 64: + case 256: + case 1024: + case 4096: + arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 ); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle ); + break; + } + } + + if ( bitReverseFlag ) + arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable); +} + +/** + @} end of ComplexFFT group + */ + +void arm_cfft_radix4by2_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef) +{ + uint32_t i, l; + uint32_t n2; + q31_t xt, yt, cosVal, sinVal; + q31_t p0, p1; + + n2 = fftLen >> 1U; + for (i = 0; i < n2; i++) + { + cosVal = pCoef[2 * i]; + sinVal = pCoef[2 * i + 1]; + + l = i + n2; + + xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U); + pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U); + + yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U); + pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U); + + mult_32x32_keep32_R(p0, xt, cosVal); + mult_32x32_keep32_R(p1, yt, cosVal); + multAcc_32x32_keep32_R(p0, yt, sinVal); + multSub_32x32_keep32_R(p1, xt, sinVal); + + pSrc[2 * l] = p0 << 1; + pSrc[2 * l + 1] = p1 << 1; + } + + + /* first col */ + arm_radix4_butterfly_q31 (pSrc, n2, (q31_t*)pCoef, 2U); + + /* second col */ + arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U); + + n2 = fftLen >> 1U; + for (i = 0; i < n2; i++) + { + p0 = pSrc[4 * i + 0]; + p1 = pSrc[4 * i + 1]; + xt = pSrc[4 * i + 2]; + yt = pSrc[4 * i + 3]; + + p0 <<= 1U; + p1 <<= 1U; + xt <<= 1U; + yt <<= 1U; + + pSrc[4 * i + 0] = p0; + pSrc[4 * i + 1] = p1; + pSrc[4 * i + 2] = xt; + pSrc[4 * i + 3] = yt; + } + +} + +void arm_cfft_radix4by2_inverse_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef) +{ + uint32_t i, l; + uint32_t n2; + q31_t xt, yt, cosVal, sinVal; + q31_t p0, p1; + + n2 = fftLen >> 1U; + for (i = 0; i < n2; i++) + { + cosVal = pCoef[2 * i]; + sinVal = pCoef[2 * i + 1]; + + l = i + n2; + + xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U); + pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U); + + yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U); + pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U); + + mult_32x32_keep32_R(p0, xt, cosVal); + mult_32x32_keep32_R(p1, yt, cosVal); + multSub_32x32_keep32_R(p0, yt, sinVal); + multAcc_32x32_keep32_R(p1, xt, sinVal); + + pSrc[2 * l] = p0 << 1U; + pSrc[2 * l + 1] = p1 << 1U; + } + + /* first col */ + arm_radix4_butterfly_inverse_q31( pSrc, n2, (q31_t*)pCoef, 2U); + + /* second col */ + arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U); + + n2 = fftLen >> 1U; + for (i = 0; i < n2; i++) + { + p0 = pSrc[4 * i + 0]; + p1 = pSrc[4 * i + 1]; + xt = pSrc[4 * i + 2]; + yt = pSrc[4 * i + 3]; + + p0 <<= 1U; + p1 <<= 1U; + xt <<= 1U; + yt <<= 1U; + + pSrc[4 * i + 0] = p0; + pSrc[4 * i + 1] = p1; + pSrc[4 * i + 2] = xt; + pSrc[4 * i + 3] = yt; + } +} +#endif /* defined(ARM_MATH_MVEI) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c new file mode 100644 index 0000000..8863fbc --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c @@ -0,0 +1,475 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix2_f16.c + * Description: Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +void arm_radix2_butterfly_f16( + float16_t * pSrc, + uint32_t fftLen, + const float16_t * pCoef, + uint16_t twidCoefModifier); + +void arm_radix2_butterfly_inverse_f16( + float16_t * pSrc, + uint32_t fftLen, + const float16_t * pCoef, + uint16_t twidCoefModifier, + float16_t onebyfftLen); + +extern void arm_bitreversal_f16( + float16_t * pSrc, + uint16_t fftSize, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Radix-2 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future + @param[in] S points to an instance of the floating-point Radix-2 CFFT/CIFFT structure + @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @return none + */ + +void arm_cfft_radix2_f16( +const arm_cfft_radix2_instance_f16 * S, + float16_t * pSrc) +{ + + if (S->ifftFlag == 1U) + { + /* Complex IFFT radix-2 */ + arm_radix2_butterfly_inverse_f16(pSrc, S->fftLen, S->pTwiddle, + S->twidCoefModifier, S->onebyfftLen); + } + else + { + /* Complex FFT radix-2 */ + arm_radix2_butterfly_f16(pSrc, S->fftLen, S->pTwiddle, + S->twidCoefModifier); + } + + if (S->bitReverseFlag == 1U) + { + /* Bit Reversal */ + arm_bitreversal_f16(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable); + } + +} + + +/** + @} end of ComplexFFT group + */ + + + +/* ---------------------------------------------------------------------- +** Internal helper function used by the FFTs +** ------------------------------------------------------------------- */ + +/* +* @brief Core function for the floating-point CFFT butterfly process. +* @param[in, out] *pSrc points to the in-place buffer of floating-point data type. +* @param[in] fftLen length of the FFT. +* @param[in] *pCoef points to the twiddle coefficient buffer. +* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. +* @return none. +*/ + +void arm_radix2_butterfly_f16( +float16_t * pSrc, +uint32_t fftLen, +const float16_t * pCoef, +uint16_t twidCoefModifier) +{ + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + float16_t xt, yt, cosVal, sinVal; + float16_t p0, p1, p2, p3; + float16_t a0, a1; + +#if defined (ARM_MATH_DSP) + + /* Initializations for the first stage */ + n2 = fftLen >> 1; + ia = 0; + i = 0; + + // loop for groups + for (k = n2; k > 0; k--) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + + /* Twiddle coefficients index modifier */ + ia += twidCoefModifier; + + /* index calculation for the input as, */ + /* pSrc[i + 0], pSrc[i + fftLen/1] */ + l = i + n2; + + /* Butterfly implementation */ + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l]; + + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1]; + a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1]; + + p0 = (_Float16)xt * (_Float16)cosVal; + p1 = (_Float16)yt * (_Float16)sinVal; + p2 = (_Float16)yt * (_Float16)cosVal; + p3 = (_Float16)xt * (_Float16)sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = (_Float16)p0 + (_Float16)p1; + pSrc[2 * l + 1] = (_Float16)p2 - (_Float16)p3; + + i++; + } // groups loop end + + twidCoefModifier <<= 1U; + + // loop for stage + for (k = n2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + j = 0; + do + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia += twidCoefModifier; + + // loop for butterfly + i = j; + do + { + l = i + n2; + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l]; + + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1]; + a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1]; + + p0 = (_Float16)xt * (_Float16)cosVal; + p1 = (_Float16)yt * (_Float16)sinVal; + p2 = (_Float16)yt * (_Float16)cosVal; + p3 = (_Float16)xt * (_Float16)sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = (_Float16)p0 + (_Float16)p1; + pSrc[2 * l + 1] = (_Float16)p2 - (_Float16)p3; + + i += n1; + } while ( i < fftLen ); // butterfly loop end + j++; + } while ( j < n2); // groups loop end + twidCoefModifier <<= 1U; + } // stages loop end + + // loop for butterfly + for (i = 0; i < fftLen; i += 2) + { + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * i + 2]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * i + 2]; + + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * i + 3]; + a1 = (_Float16)pSrc[2 * i + 3] + (_Float16)pSrc[2 * i + 1]; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + pSrc[2 * i + 2] = xt; + pSrc[2 * i + 3] = yt; + } // groups loop end + +#else + + n2 = fftLen; + + // loop for stage + for (k = fftLen; k > 1; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + j = 0; + do + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia += twidCoefModifier; + + // loop for butterfly + i = j; + do + { + l = i + n2; + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l]; + + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1]; + a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1]; + + p0 = (_Float16)xt * (_Float16)cosVal; + p1 = (_Float16)yt * (_Float16)sinVal; + p2 = (_Float16)yt * (_Float16)cosVal; + p3 = (_Float16)xt * (_Float16)sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = (_Float16)p0 + (_Float16)p1; + pSrc[2 * l + 1] = (_Float16)p2 - (_Float16)p3; + + i += n1; + } while (i < fftLen); + j++; + } while (j < n2); + twidCoefModifier <<= 1U; + } + +#endif // #if defined (ARM_MATH_DSP) + +} + + +void arm_radix2_butterfly_inverse_f16( +float16_t * pSrc, +uint32_t fftLen, +const float16_t * pCoef, +uint16_t twidCoefModifier, +float16_t onebyfftLen) +{ + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + float16_t xt, yt, cosVal, sinVal; + float16_t p0, p1, p2, p3; + float16_t a0, a1; + +#if defined (ARM_MATH_DSP) + + n2 = fftLen >> 1; + ia = 0; + + // loop for groups + for (i = 0; i < n2; i++) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia += twidCoefModifier; + + l = i + n2; + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l]; + + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1]; + a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1]; + + p0 = (_Float16)xt * (_Float16)cosVal; + p1 = (_Float16)yt * (_Float16)sinVal; + p2 = (_Float16)yt * (_Float16)cosVal; + p3 = (_Float16)xt * (_Float16)sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = (_Float16)p0 - (_Float16)p1; + pSrc[2 * l + 1] = (_Float16)p2 + (_Float16)p3; + } // groups loop end + + twidCoefModifier <<= 1U; + + // loop for stage + for (k = fftLen / 2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + j = 0; + do + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia += twidCoefModifier; + + // loop for butterfly + i = j; + do + { + l = i + n2; + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l]; + + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1]; + a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1]; + + p0 = (_Float16)xt * (_Float16)cosVal; + p1 = (_Float16)yt * (_Float16)sinVal; + p2 = (_Float16)yt * (_Float16)cosVal; + p3 = (_Float16)xt * (_Float16)sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = (_Float16)p0 - (_Float16)p1; + pSrc[2 * l + 1] = (_Float16)p2 + (_Float16)p3; + + i += n1; + } while ( i < fftLen ); // butterfly loop end + j++; + } while (j < n2); // groups loop end + + twidCoefModifier <<= 1U; + } // stages loop end + + // loop for butterfly + for (i = 0; i < fftLen; i += 2) + { + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * i + 2]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * i + 2]; + + a1 = (_Float16)pSrc[2 * i + 3] + (_Float16)pSrc[2 * i + 1]; + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * i + 3]; + + p0 = (_Float16)a0 * (_Float16)onebyfftLen; + p2 = (_Float16)xt * (_Float16)onebyfftLen; + p1 = (_Float16)a1 * (_Float16)onebyfftLen; + p3 = (_Float16)yt * (_Float16)onebyfftLen; + + pSrc[2 * i] = p0; + pSrc[2 * i + 1] = p1; + pSrc[2 * i + 2] = p2; + pSrc[2 * i + 3] = p3; + } // butterfly loop end + +#else + + n2 = fftLen; + + // loop for stage + for (k = fftLen; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + j = 0; + do + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + // loop for butterfly + i = j; + do + { + l = i + n2; + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l]; + + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1]; + a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1]; + + p0 = (_Float16)xt * (_Float16)cosVal; + p1 = (_Float16)yt * (_Float16)sinVal; + p2 = (_Float16)yt * (_Float16)cosVal; + p3 = (_Float16)xt * (_Float16)sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = (_Float16)p0 - (_Float16)p1; + pSrc[2 * l + 1] = (_Float16)p2 + (_Float16)p3; + + i += n1; + } while ( i < fftLen ); // butterfly loop end + j++; + } while ( j < n2 ); // groups loop end + + twidCoefModifier = twidCoefModifier << 1U; + } // stages loop end + + n1 = n2; + n2 = n2 >> 1; + + // loop for butterfly + for (i = 0; i < fftLen; i += n1) + { + l = i + n2; + + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l]; + + a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1]; + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1]; + + p0 = (_Float16)a0 * (_Float16)onebyfftLen; + p2 = (_Float16)xt * (_Float16)onebyfftLen; + p1 = (_Float16)a1 * (_Float16)onebyfftLen; + p3 = (_Float16)yt * (_Float16)onebyfftLen; + + pSrc[2 * i] = p0; + pSrc[2U * l] = p2; + + pSrc[2 * i + 1] = p1; + pSrc[2U * l + 1U] = p3; + } // butterfly loop end + +#endif // #if defined (ARM_MATH_DSP) + +} + + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c new file mode 100644 index 0000000..ab218a5 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c @@ -0,0 +1,470 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix2_f32.c + * Description: Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +void arm_radix2_butterfly_f32( + float32_t * pSrc, + uint32_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier); + +void arm_radix2_butterfly_inverse_f32( + float32_t * pSrc, + uint32_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier, + float32_t onebyfftLen); + +extern void arm_bitreversal_f32( + float32_t * pSrc, + uint16_t fftSize, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Radix-2 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future + @param[in] S points to an instance of the floating-point Radix-2 CFFT/CIFFT structure + @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @return none + */ + +void arm_cfft_radix2_f32( +const arm_cfft_radix2_instance_f32 * S, + float32_t * pSrc) +{ + + if (S->ifftFlag == 1U) + { + /* Complex IFFT radix-2 */ + arm_radix2_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle, + S->twidCoefModifier, S->onebyfftLen); + } + else + { + /* Complex FFT radix-2 */ + arm_radix2_butterfly_f32(pSrc, S->fftLen, S->pTwiddle, + S->twidCoefModifier); + } + + if (S->bitReverseFlag == 1U) + { + /* Bit Reversal */ + arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable); + } + +} + + +/** + @} end of ComplexFFT group + */ + + + +/* ---------------------------------------------------------------------- + ** Internal helper function used by the FFTs + ** ------------------------------------------------------------------- */ + +/** + brief Core function for the floating-point CFFT butterfly process. + param[in,out] pSrc points to in-place buffer of floating-point data type + param[in] fftLen length of the FFT + param[in] pCoef points to twiddle coefficient buffer + param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + return none + */ + +void arm_radix2_butterfly_f32( + float32_t * pSrc, + uint32_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier) +{ + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + float32_t xt, yt, cosVal, sinVal; + float32_t p0, p1, p2, p3; + float32_t a0, a1; + +#if defined (ARM_MATH_DSP) + + /* Initializations for the first stage */ + n2 = fftLen >> 1; + ia = 0; + i = 0; + + // loop for groups + for (k = n2; k > 0; k--) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + + /* Twiddle coefficients index modifier */ + ia += twidCoefModifier; + + /* index calculation for the input as, */ + /* pSrc[i + 0], pSrc[i + fftLen/1] */ + l = i + n2; + + /* Butterfly implementation */ + a0 = pSrc[2 * i] + pSrc[2 * l]; + xt = pSrc[2 * i] - pSrc[2 * l]; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1]; + + p0 = xt * cosVal; + p1 = yt * sinVal; + p2 = yt * cosVal; + p3 = xt * sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = p0 + p1; + pSrc[2 * l + 1] = p2 - p3; + + i++; + } // groups loop end + + twidCoefModifier <<= 1U; + + // loop for stage + for (k = n2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + j = 0; + do + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia += twidCoefModifier; + + // loop for butterfly + i = j; + do + { + l = i + n2; + a0 = pSrc[2 * i] + pSrc[2 * l]; + xt = pSrc[2 * i] - pSrc[2 * l]; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1]; + + p0 = xt * cosVal; + p1 = yt * sinVal; + p2 = yt * cosVal; + p3 = xt * sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = p0 + p1; + pSrc[2 * l + 1] = p2 - p3; + + i += n1; + } while ( i < fftLen ); // butterfly loop end + j++; + } while ( j < n2); // groups loop end + twidCoefModifier <<= 1U; + } // stages loop end + + // loop for butterfly + for (i = 0; i < fftLen; i += 2) + { + a0 = pSrc[2 * i] + pSrc[2 * i + 2]; + xt = pSrc[2 * i] - pSrc[2 * i + 2]; + + yt = pSrc[2 * i + 1] - pSrc[2 * i + 3]; + a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1]; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + pSrc[2 * i + 2] = xt; + pSrc[2 * i + 3] = yt; + } // groups loop end + +#else /* #if defined (ARM_MATH_DSP) */ + + n2 = fftLen; + + // loop for stage + for (k = fftLen; k > 1; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + j = 0; + do + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia += twidCoefModifier; + + // loop for butterfly + i = j; + do + { + l = i + n2; + a0 = pSrc[2 * i] + pSrc[2 * l]; + xt = pSrc[2 * i] - pSrc[2 * l]; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1]; + + p0 = xt * cosVal; + p1 = yt * sinVal; + p2 = yt * cosVal; + p3 = xt * sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = p0 + p1; + pSrc[2 * l + 1] = p2 - p3; + + i += n1; + } while (i < fftLen); + j++; + } while (j < n2); + twidCoefModifier <<= 1U; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + + +void arm_radix2_butterfly_inverse_f32( + float32_t * pSrc, + uint32_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier, + float32_t onebyfftLen) +{ + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + float32_t xt, yt, cosVal, sinVal; + float32_t p0, p1, p2, p3; + float32_t a0, a1; + +#if defined (ARM_MATH_DSP) + + n2 = fftLen >> 1; + ia = 0; + + // loop for groups + for (i = 0; i < n2; i++) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia += twidCoefModifier; + + l = i + n2; + a0 = pSrc[2 * i] + pSrc[2 * l]; + xt = pSrc[2 * i] - pSrc[2 * l]; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1]; + + p0 = xt * cosVal; + p1 = yt * sinVal; + p2 = yt * cosVal; + p3 = xt * sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = p0 - p1; + pSrc[2 * l + 1] = p2 + p3; + } // groups loop end + + twidCoefModifier <<= 1U; + + // loop for stage + for (k = fftLen / 2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + j = 0; + do + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia += twidCoefModifier; + + // loop for butterfly + i = j; + do + { + l = i + n2; + a0 = pSrc[2 * i] + pSrc[2 * l]; + xt = pSrc[2 * i] - pSrc[2 * l]; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1]; + + p0 = xt * cosVal; + p1 = yt * sinVal; + p2 = yt * cosVal; + p3 = xt * sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = p0 - p1; + pSrc[2 * l + 1] = p2 + p3; + + i += n1; + } while ( i < fftLen ); // butterfly loop end + j++; + } while (j < n2); // groups loop end + + twidCoefModifier <<= 1U; + } // stages loop end + + // loop for butterfly + for (i = 0; i < fftLen; i += 2) + { + a0 = pSrc[2 * i] + pSrc[2 * i + 2]; + xt = pSrc[2 * i] - pSrc[2 * i + 2]; + + a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1]; + yt = pSrc[2 * i + 1] - pSrc[2 * i + 3]; + + p0 = a0 * onebyfftLen; + p2 = xt * onebyfftLen; + p1 = a1 * onebyfftLen; + p3 = yt * onebyfftLen; + + pSrc[2 * i] = p0; + pSrc[2 * i + 1] = p1; + pSrc[2 * i + 2] = p2; + pSrc[2 * i + 3] = p3; + } // butterfly loop end + +#else /* #if defined (ARM_MATH_DSP) */ + + n2 = fftLen; + + // loop for stage + for (k = fftLen; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + j = 0; + do + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + // loop for butterfly + i = j; + do + { + l = i + n2; + a0 = pSrc[2 * i] + pSrc[2 * l]; + xt = pSrc[2 * i] - pSrc[2 * l]; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1]; + + p0 = xt * cosVal; + p1 = yt * sinVal; + p2 = yt * cosVal; + p3 = xt * sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = p0 - p1; + pSrc[2 * l + 1] = p2 + p3; + + i += n1; + } while ( i < fftLen ); // butterfly loop end + j++; + } while ( j < n2 ); // groups loop end + + twidCoefModifier = twidCoefModifier << 1U; + } // stages loop end + + n1 = n2; + n2 = n2 >> 1; + + // loop for butterfly + for (i = 0; i < fftLen; i += n1) + { + l = i + n2; + + a0 = pSrc[2 * i] + pSrc[2 * l]; + xt = pSrc[2 * i] - pSrc[2 * l]; + + a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1]; + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + + p0 = a0 * onebyfftLen; + p2 = xt * onebyfftLen; + p1 = a1 * onebyfftLen; + p3 = yt * onebyfftLen; + + pSrc[2 * i] = p0; + pSrc[2 * l] = p2; + + pSrc[2 * i + 1] = p1; + pSrc[2 * l + 1] = p3; + } // butterfly loop end + +#endif /* #if defined (ARM_MATH_DSP) */ + +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c new file mode 100644 index 0000000..e021a0e --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c @@ -0,0 +1,214 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix2_init_f16.c + * Description: Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" +#include "arm_common_tables.h" +#include "arm_common_tables_f16.h" + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the floating-point CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future. + @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure + @param[in] fftLen length of the FFT + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Details + The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. +*/ + +#if defined(ARM_FLOAT16_SUPPORTED) + +arm_status arm_cfft_radix2_init_f16( + arm_cfft_radix2_instance_f16 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) + + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = (float16_t *) twiddleCoefF16_4096; + + /* Initialise the Flag for selection of CFFT or CIFFT */ + S->ifftFlag = ifftFlag; + + /* Initialise the Flag for calculation Bit reversal or not */ + S->bitReverseFlag = bitReverseFlag; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024) + + /* Initializations of structure parameters depending on the FFT length */ + switch (S->fftLen) + { + + case 4096U: + /* Initializations of structure parameters for 4096 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 1U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 1U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) armBitRevTable; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.000244140625; + break; + + case 2048U: + /* Initializations of structure parameters for 2048 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 2U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 2U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[1]; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.00048828125; + break; + + case 1024U: + /* Initializations of structure parameters for 1024 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 4U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 4U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.0009765625f; + break; + + case 512U: + /* Initializations of structure parameters for 512 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 8U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 8U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[7]; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.001953125; + break; + + case 256U: + /* Initializations of structure parameters for 256 point FFT */ + S->twidCoefModifier = 16U; + S->bitRevFactor = 16U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[15]; + S->onebyfftLen = 0.00390625f; + break; + + case 128U: + /* Initializations of structure parameters for 128 point FFT */ + S->twidCoefModifier = 32U; + S->bitRevFactor = 32U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[31]; + S->onebyfftLen = 0.0078125; + break; + + case 64U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 64U; + S->bitRevFactor = 64U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[63]; + S->onebyfftLen = 0.015625f; + break; + + case 32U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 128U; + S->bitRevFactor = 128U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[127]; + S->onebyfftLen = 0.03125; + break; + + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->twidCoefModifier = 256U; + S->bitRevFactor = 256U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[255]; + S->onebyfftLen = 0.0625f; + break; + + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif +#endif + return (status); +} + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c new file mode 100644 index 0000000..ae9f29a --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c @@ -0,0 +1,209 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix2_init_f32.c + * Description: Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the floating-point CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future. + @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure + @param[in] fftLen length of the FFT + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Details + The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. +*/ + +arm_status arm_cfft_radix2_init_f32( + arm_cfft_radix2_instance_f32 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = (float32_t *) twiddleCoef; + + /* Initialise the Flag for selection of CFFT or CIFFT */ + S->ifftFlag = ifftFlag; + + /* Initialise the Flag for calculation Bit reversal or not */ + S->bitReverseFlag = bitReverseFlag; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) + + /* Initializations of structure parameters depending on the FFT length */ + switch (S->fftLen) + { + + case 4096U: + /* Initializations of structure parameters for 4096 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 1U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 1U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) armBitRevTable; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.000244140625; + break; + + case 2048U: + /* Initializations of structure parameters for 2048 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 2U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 2U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[1]; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.00048828125; + break; + + case 1024U: + /* Initializations of structure parameters for 1024 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 4U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 4U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.0009765625f; + break; + + case 512U: + /* Initializations of structure parameters for 512 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 8U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 8U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[7]; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.001953125; + break; + + case 256U: + /* Initializations of structure parameters for 256 point FFT */ + S->twidCoefModifier = 16U; + S->bitRevFactor = 16U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[15]; + S->onebyfftLen = 0.00390625f; + break; + + case 128U: + /* Initializations of structure parameters for 128 point FFT */ + S->twidCoefModifier = 32U; + S->bitRevFactor = 32U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[31]; + S->onebyfftLen = 0.0078125; + break; + + case 64U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 64U; + S->bitRevFactor = 64U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[63]; + S->onebyfftLen = 0.015625f; + break; + + case 32U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 128U; + S->bitRevFactor = 128U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[127]; + S->onebyfftLen = 0.03125; + break; + + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->twidCoefModifier = 256U; + S->bitRevFactor = 256U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[255]; + S->onebyfftLen = 0.0625f; + break; + + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif +#endif + return (status); +} + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c new file mode 100644 index 0000000..68c9930 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c @@ -0,0 +1,194 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix2_init_q15.c + * Description: Radix-2 Decimation in Frequency Q15 FFT & IFFT initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the Q15 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed + @param[in,out] S points to an instance of the Q15 CFFT/CIFFT structure. + @param[in] fftLen length of the FFT. + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Details + The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. +*/ + +arm_status arm_cfft_radix2_init_q15( + arm_cfft_radix2_instance_q15 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = (q15_t *) twiddleCoef_4096_q15; + /* Initialise the Flag for selection of CFFT or CIFFT */ + S->ifftFlag = ifftFlag; + /* Initialise the Flag for calculation Bit reversal or not */ + S->bitReverseFlag = bitReverseFlag; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024) + + /* Initializations of structure parameters depending on the FFT length */ + switch (S->fftLen) + { + case 4096U: + /* Initializations of structure parameters for 4096 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 1U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 1U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) armBitRevTable; + + break; + + case 2048U: + /* Initializations of structure parameters for 2048 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 2U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 2U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[1]; + + break; + + case 1024U: + /* Initializations of structure parameters for 1024 point FFT */ + S->twidCoefModifier = 4U; + S->bitRevFactor = 4U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; + + break; + + case 512U: + /* Initializations of structure parameters for 512 point FFT */ + S->twidCoefModifier = 8U; + S->bitRevFactor = 8U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[7]; + + break; + + case 256U: + /* Initializations of structure parameters for 256 point FFT */ + S->twidCoefModifier = 16U; + S->bitRevFactor = 16U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[15]; + + break; + + case 128U: + /* Initializations of structure parameters for 128 point FFT */ + S->twidCoefModifier = 32U; + S->bitRevFactor = 32U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[31]; + + break; + + case 64U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 64U; + S->bitRevFactor = 64U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[63]; + + break; + + case 32U: + /* Initializations of structure parameters for 32 point FFT */ + S->twidCoefModifier = 128U; + S->bitRevFactor = 128U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[127]; + + break; + + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->twidCoefModifier = 256U; + S->bitRevFactor = 256U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[255]; + + break; + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif +#endif + return (status); +} + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c new file mode 100644 index 0000000..73b8a39 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c @@ -0,0 +1,191 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix2_init_q31.c + * Description: Radix-2 Decimation in Frequency Fixed-point CFFT & CIFFT Initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the Q31 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future. + @param[in,out] S points to an instance of the Q31 CFFT/CIFFT structure + @param[in] fftLen length of the FFT + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Details + The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. +*/ + +arm_status arm_cfft_radix2_init_q31( + arm_cfft_radix2_instance_q31 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = (q31_t *) twiddleCoef_4096_q31; + + /* Initialise the Flag for selection of CFFT or CIFFT */ + S->ifftFlag = ifftFlag; + + /* Initialise the Flag for calculation Bit reversal or not */ + S->bitReverseFlag = bitReverseFlag; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024) + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) + { + /* Initializations of structure parameters for 4096 point FFT */ + case 4096U: + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 1U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 1U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) armBitRevTable; + break; + + /* Initializations of structure parameters for 2048 point FFT */ + case 2048U: + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 2U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 2U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[1]; + break; + + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 4U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 4U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; + break; + + /* Initializations of structure parameters for 512 point FFT */ + case 512U: + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 8U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 8U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[7]; + break; + + case 256U: + /* Initializations of structure parameters for 256 point FFT */ + S->twidCoefModifier = 16U; + S->bitRevFactor = 16U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[15]; + break; + + case 128U: + /* Initializations of structure parameters for 128 point FFT */ + S->twidCoefModifier = 32U; + S->bitRevFactor = 32U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[31]; + break; + + case 64U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 64U; + S->bitRevFactor = 64U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[63]; + break; + + case 32U: + /* Initializations of structure parameters for 32 point FFT */ + S->twidCoefModifier = 128U; + S->bitRevFactor = 128U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[127]; + break; + + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->twidCoefModifier = 256U; + S->bitRevFactor = 256U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[255]; + break; + + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif +#endif + return (status); +} + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c new file mode 100644 index 0000000..ca15ea1 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c @@ -0,0 +1,689 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix2_q15.c + * Description: Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +void arm_radix2_butterfly_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef, + uint16_t twidCoefModifier); + +void arm_radix2_butterfly_inverse_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef, + uint16_t twidCoefModifier); + +void arm_bitreversal_q15( + q15_t * pSrc, + uint32_t fftLen, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the fixed-point CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future. + @param[in] S points to an instance of the fixed-point CFFT/CIFFT structure + @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @return none + */ + +void arm_cfft_radix2_q15( + const arm_cfft_radix2_instance_q15 * S, + q15_t * pSrc) +{ + + if (S->ifftFlag == 1U) + { + arm_radix2_butterfly_inverse_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier); + } + else + { + arm_radix2_butterfly_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier); + } + + arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable); +} + +/** + @} end of ComplexFFT group + */ + +void arm_radix2_butterfly_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef, + uint16_t twidCoefModifier) +{ +#if defined (ARM_MATH_DSP) + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + q15_t in; + q31_t T, S, R; + q31_t coeff, out1, out2; + + //N = fftLen; + n2 = fftLen; + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + for (i = 0; i < n2; i++) + { + coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U)); + + ia = ia + twidCoefModifier; + + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + in = ((int16_t) (T & 0xFFFF)) >> 1; + T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF); + + S = read_q15x2 (pSrc + (2 * l)); + in = ((int16_t) (S & 0xFFFF)) >> 1; + S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUAD(coeff, R) >> 16; + out2 = __SMUSDX(coeff, R); +#else + out1 = __SMUSDX(R, coeff) >> 16U; + out2 = __SMUAD(coeff, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); + + coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U)); + + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + i++; + l++; + + T = read_q15x2 (pSrc + (2 * i)); + in = ((int16_t) (T & 0xFFFF)) >> 1; + T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF); + + S = read_q15x2 (pSrc + (2 * l)); + in = ((int16_t) (S & 0xFFFF)) >> 1; + S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUAD(coeff, R) >> 16; + out2 = __SMUSDX(coeff, R); +#else + + out1 = __SMUSDX(R, coeff) >> 16U; + out2 = __SMUAD(coeff, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + + /* loop for stage */ + for (k = fftLen / 2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) + { + coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U)); + + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + + S = read_q15x2 (pSrc + (2 * l)); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUAD(coeff, R) >> 16; + out2 = __SMUSDX(coeff, R); +#else + out1 = __SMUSDX(R, coeff) >> 16U; + out2 = __SMUAD(coeff, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); + + i += n1; + + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + + S = read_q15x2 (pSrc + (2 * l)); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUAD(coeff, R) >> 16; + out2 = __SMUSDX(coeff, R); +#else + out1 = __SMUSDX(R, coeff) >> 16U; + out2 = __SMUAD(coeff, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + } /* stages loop end */ + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U)); + + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = 0; i < fftLen; i += n1) + { + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + + S = read_q15x2 (pSrc + (2 * l)); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __QADD16(T, S)); + + write_q15x2 (pSrc + (2 * l), R); + + i += n1; + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + + S = read_q15x2 (pSrc + (2 * l)); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __QADD16(T, S)); + + write_q15x2 (pSrc + (2 * l), R); + + } /* groups loop end */ + + +#else /* #if defined (ARM_MATH_DSP) */ + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + q15_t xt, yt, cosVal, sinVal; + + + // N = fftLen; + n2 = fftLen; + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) + { + cosVal = pCoef[(ia * 2)]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U); + pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U; + + yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U); + pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + + (pSrc[2 * i + 1] >> 1U) ) >> 1U; + + pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) + + ((int16_t) (((q31_t) yt * sinVal) >> 16))); + + pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) - + ((int16_t) (((q31_t) xt * sinVal) >> 16))); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + + /* loop for stage */ + for (k = fftLen / 2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U; + + pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) + + ((int16_t) (((q31_t) yt * sinVal) >> 16))); + + pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) - + ((int16_t) (((q31_t) xt * sinVal) >> 16))); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + } /* stages loop end */ + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]); + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]); + + pSrc[2 * l] = xt; + + pSrc[2 * l + 1] = yt; + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + + +void arm_radix2_butterfly_inverse_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pCoef, + uint16_t twidCoefModifier) +{ +#if defined (ARM_MATH_DSP) + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + q15_t in; + q31_t T, S, R; + q31_t coeff, out1, out2; + + // N = fftLen; + n2 = fftLen; + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (i = 0; i < n2; i++) + { + coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U)); + + ia = ia + twidCoefModifier; + + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + in = ((int16_t) (T & 0xFFFF)) >> 1; + T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF); + + S = read_q15x2 (pSrc + (2 * l)); + in = ((int16_t) (S & 0xFFFF)) >> 1; + S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUSD(coeff, R) >> 16; + out2 = __SMUADX(coeff, R); +#else + out1 = __SMUADX(R, coeff) >> 16U; + out2 = __SMUSD(__QSUB(0, coeff), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); + + coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U)); + + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + i++; + l++; + + T = read_q15x2 (pSrc + (2 * i)); + in = ((int16_t) (T & 0xFFFF)) >> 1; + T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF); + + S = read_q15x2 (pSrc + (2 * l)); + in = ((int16_t) (S & 0xFFFF)) >> 1; + S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUSD(coeff, R) >> 16; + out2 = __SMUADX(coeff, R); +#else + out1 = __SMUADX(R, coeff) >> 16U; + out2 = __SMUSD(__QSUB(0, coeff), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + + /* loop for stage */ + for (k = fftLen / 2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) + { + coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U)); + + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + + S = read_q15x2 (pSrc + (2 * l)); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUSD(coeff, R) >> 16; + out2 = __SMUADX(coeff, R); +#else + out1 = __SMUADX(R, coeff) >> 16U; + out2 = __SMUSD(__QSUB(0, coeff), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); + + i += n1; + + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + + S = read_q15x2 (pSrc + (2 * l)); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __SHADD16(T, S)); + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUSD(coeff, R) >> 16; + out2 = __SMUADX(coeff, R); +#else + out1 = __SMUADX(R, coeff) >> 16U; + out2 = __SMUSD(__QSUB(0, coeff), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + } /* stages loop end */ + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) + { + coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U)); + + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + + T = read_q15x2 (pSrc + (2 * i)); + + S = read_q15x2 (pSrc + (2 * l)); + + R = __QSUB16(T, S); + + write_q15x2 (pSrc + (2 * i), __QADD16(T, S)); + + write_q15x2 (pSrc + (2 * l), R); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + +#else /* #if defined (ARM_MATH_DSP) */ + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + q15_t xt, yt, cosVal, sinVal; + + // N = fftLen; + n2 = fftLen; + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) + { + cosVal = pCoef[(ia * 2)]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U); + pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U; + + yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U); + pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + + (pSrc[2 * i + 1] >> 1U) ) >> 1U; + + pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) - + ((int16_t) (((q31_t) yt * sinVal) >> 16))); + + pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) + + ((int16_t) (((q31_t) xt * sinVal) >> 16))); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + + /* loop for stage */ + for (k = fftLen / 2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) + { + cosVal = pCoef[(ia * 2)]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U; + + pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) - + ((int16_t) (((q31_t) yt * sinVal) >> 16)) ); + + pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) + + ((int16_t) (((q31_t) xt * sinVal) >> 16)) ); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + } /* stages loop end */ + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + cosVal = pCoef[(ia * 2)]; + sinVal = pCoef[(ia * 2) + 1]; + + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = 0; i < fftLen; i += n1) + { + l = i + n2; + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]); + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]); + + pSrc[2 * l] = xt; + + pSrc[2 * l + 1] = yt; + + } /* groups loop end */ + + +#endif /* #if defined (ARM_MATH_DSP) */ + +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c new file mode 100644 index 0000000..996e91d --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c @@ -0,0 +1,337 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix2_q31.c + * Description: Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +void arm_radix2_butterfly_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint16_t twidCoefModifier); + +void arm_radix2_butterfly_inverse_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint16_t twidCoefModifier); + +void arm_bitreversal_q31( + q31_t * pSrc, + uint32_t fftLen, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the fixed-point CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future. + @param[in] S points to an instance of the fixed-point CFFT/CIFFT structure + @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @return none + */ + +void arm_cfft_radix2_q31( + const arm_cfft_radix2_instance_q31 * S, + q31_t * pSrc) +{ + + if (S->ifftFlag == 1U) + { + arm_radix2_butterfly_inverse_q31(pSrc, S->fftLen, + S->pTwiddle, S->twidCoefModifier); + } + else + { + arm_radix2_butterfly_q31(pSrc, S->fftLen, + S->pTwiddle, S->twidCoefModifier); + } + + arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable); +} + +/** + @} end of ComplexFFT group + */ + +void arm_radix2_butterfly_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint16_t twidCoefModifier) +{ + + unsigned i, j, k, l, m; + unsigned n1, n2, ia; + q31_t xt, yt, cosVal, sinVal; + q31_t p0, p1; + + //N = fftLen; + n2 = fftLen; + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + for (i = 0; i < n2; i++) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + l = i + n2; + xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U); + pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U; + + yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U); + pSrc[2 * i + 1] = + ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U; + + mult_32x32_keep32_R(p0, xt, cosVal); + mult_32x32_keep32_R(p1, yt, cosVal); + multAcc_32x32_keep32_R(p0, yt, sinVal); + multSub_32x32_keep32_R(p1, xt, sinVal); + + pSrc[2U * l] = p0; + pSrc[2U * l + 1U] = p1; + + } // groups loop end + + twidCoefModifier <<= 1U; + + // loop for stage + for (k = fftLen / 2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + for (j = 0; j < n2; j++) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + // loop for butterfly + i = j; + m = fftLen / n1; + do + { + l = i + n2; + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U; + + mult_32x32_keep32_R(p0, xt, cosVal); + mult_32x32_keep32_R(p1, yt, cosVal); + multAcc_32x32_keep32_R(p0, yt, sinVal); + multSub_32x32_keep32_R(p1, xt, sinVal); + + pSrc[2U * l] = p0; + pSrc[2U * l + 1U] = p1; + i += n1; + m--; + } while ( m > 0); // butterfly loop end + + } // groups loop end + + twidCoefModifier <<= 1U; + } // stages loop end + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + // loop for butterfly + for (i = 0; i < fftLen; i += n1) + { + l = i + n2; + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]); + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]); + + pSrc[2U * l] = xt; + + pSrc[2U * l + 1U] = yt; + + i += n1; + l = i + n2; + + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]); + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]); + + pSrc[2U * l] = xt; + + pSrc[2U * l + 1U] = yt; + + } // butterfly loop end + +} + + +void arm_radix2_butterfly_inverse_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint16_t twidCoefModifier) +{ + + unsigned i, j, k, l; + unsigned n1, n2, ia; + q31_t xt, yt, cosVal, sinVal; + q31_t p0, p1; + + //N = fftLen; + n2 = fftLen; + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + for (i = 0; i < n2; i++) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + l = i + n2; + xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U); + pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U; + + yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U); + pSrc[2 * i + 1] = + ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U; + + mult_32x32_keep32_R(p0, xt, cosVal); + mult_32x32_keep32_R(p1, yt, cosVal); + multSub_32x32_keep32_R(p0, yt, sinVal); + multAcc_32x32_keep32_R(p1, xt, sinVal); + + pSrc[2U * l] = p0; + pSrc[2U * l + 1U] = p1; + } // groups loop end + + twidCoefModifier = twidCoefModifier << 1U; + + // loop for stage + for (k = fftLen / 2; k > 2; k = k >> 1) + { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + // loop for groups + for (j = 0; j < n2; j++) + { + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + // loop for butterfly + for (i = j; i < fftLen; i += n1) + { + l = i + n2; + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U; + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U; + + mult_32x32_keep32_R(p0, xt, cosVal); + mult_32x32_keep32_R(p1, yt, cosVal); + multSub_32x32_keep32_R(p0, yt, sinVal); + multAcc_32x32_keep32_R(p1, xt, sinVal); + + pSrc[2U * l] = p0; + pSrc[2U * l + 1U] = p1; + } // butterfly loop end + + } // groups loop end + + twidCoefModifier = twidCoefModifier << 1U; + } // stages loop end + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + cosVal = pCoef[ia * 2]; + sinVal = pCoef[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + // loop for butterfly + for (i = 0; i < fftLen; i += n1) + { + l = i + n2; + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]); + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]); + + pSrc[2U * l] = xt; + + pSrc[2U * l + 1U] = yt; + + i += n1; + l = i + n2; + + xt = pSrc[2 * i] - pSrc[2 * l]; + pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]); + + yt = pSrc[2 * i + 1] - pSrc[2 * l + 1]; + pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]); + + pSrc[2U * l] = xt; + + pSrc[2U * l + 1U] = yt; + + } // butterfly loop end + +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c new file mode 100644 index 0000000..1fc5169 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c @@ -0,0 +1,1272 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix4_f16.c + * Description: Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +extern void arm_bitreversal_f16( + float16_t * pSrc, + uint16_t fftSize, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +void arm_radix4_butterfly_f16( + float16_t * pSrc, + uint16_t fftLen, + const float16_t * pCoef, + uint16_t twidCoefModifier); + +void arm_radix4_butterfly_inverse_f16( + float16_t * pSrc, + uint16_t fftLen, + const float16_t * pCoef, + uint16_t twidCoefModifier, + float16_t onebyfftLen); + + +void arm_cfft_radix4by2_f16( + float16_t * pSrc, + uint32_t fftLen, + const float16_t * pCoef); + + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/* +* @brief Core function for the floating-point CFFT butterfly process. +* @param[in, out] *pSrc points to the in-place buffer of floating-point data type. +* @param[in] fftLen length of the FFT. +* @param[in] *pCoef points to the twiddle coefficient buffer. +* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. +* @return none. +*/ + +void arm_cfft_radix4by2_f16( + float16_t * pSrc, + uint32_t fftLen, + const float16_t * pCoef) +{ + uint32_t i, l; + uint32_t n2, ia; + float16_t xt, yt, cosVal, sinVal; + float16_t p0, p1,p2,p3,a0,a1; + + n2 = fftLen >> 1; + ia = 0; + for (i = 0; i < n2; i++) + { + cosVal = pCoef[2*ia]; + sinVal = pCoef[2*ia + 1]; + ia++; + + l = i + n2; + + /* Butterfly implementation */ + a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l]; + xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l]; + + yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1]; + a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1]; + + p0 = (_Float16)xt * (_Float16)cosVal; + p1 = (_Float16)yt * (_Float16)sinVal; + p2 = (_Float16)yt * (_Float16)cosVal; + p3 = (_Float16)xt * (_Float16)sinVal; + + pSrc[2 * i] = a0; + pSrc[2 * i + 1] = a1; + + pSrc[2 * l] = (_Float16)p0 + (_Float16)p1; + pSrc[2 * l + 1] = (_Float16)p2 - (_Float16)p3; + + } + + // first col + arm_radix4_butterfly_f16( pSrc, n2, (float16_t*)pCoef, 2U); + // second col + arm_radix4_butterfly_f16( pSrc + fftLen, n2, (float16_t*)pCoef, 2U); + +} + + +/** + @brief Processing function for the floating-point Radix-4 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future. + @param[in] S points to an instance of the floating-point Radix-4 CFFT/CIFFT structure + @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @return none + */ + +void arm_cfft_radix4_f16( + const arm_cfft_radix4_instance_f16 * S, + float16_t * pSrc) +{ + if (S->ifftFlag == 1U) + { + /* Complex IFFT radix-4 */ + arm_radix4_butterfly_inverse_f16(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier, S->onebyfftLen); + } + else + { + /* Complex FFT radix-4 */ + arm_radix4_butterfly_f16(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier); + } + + if (S->bitReverseFlag == 1U) + { + /* Bit Reversal */ + arm_bitreversal_f16(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable); + } + +} + +/** + @} end of ComplexFFT group + */ + +/* ---------------------------------------------------------------------- + * Internal helper function used by the FFTs + * ---------------------------------------------------------------------- */ + +/* +* @brief Core function for the floating-point CFFT butterfly process. +* @param[in, out] *pSrc points to the in-place buffer of floating-point data type. +* @param[in] fftLen length of the FFT. +* @param[in] *pCoef points to the twiddle coefficient buffer. +* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. +* @return none. +*/ + +void arm_radix4_butterfly_f16( +float16_t * pSrc, +uint16_t fftLen, +const float16_t * pCoef, +uint16_t twidCoefModifier) +{ + + float16_t co1, co2, co3, si1, si2, si3; + uint32_t ia1, ia2, ia3; + uint32_t i0, i1, i2, i3; + uint32_t n1, n2, j, k; + +#if defined (ARM_MATH_DSP) + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + float16_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn; + float16_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc, + Ybminusd; + float16_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out; + float16_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out; + float16_t *ptr1; + float16_t p0,p1,p2,p3,p4,p5; + float16_t a0,a1,a2,a3,a4,a5,a6,a7; + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* n2 = fftLen/4 */ + n2 >>= 2U; + i0 = 0U; + ia1 = 0U; + + j = n2; + + /* Calculation of first stage */ + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + xaIn = pSrc[(2U * i0)]; + yaIn = pSrc[(2U * i0) + 1U]; + + xbIn = pSrc[(2U * i1)]; + ybIn = pSrc[(2U * i1) + 1U]; + + xcIn = pSrc[(2U * i2)]; + ycIn = pSrc[(2U * i2) + 1U]; + + xdIn = pSrc[(2U * i3)]; + ydIn = pSrc[(2U * i3) + 1U]; + + /* xa + xc */ + Xaplusc = (_Float16)xaIn + (_Float16)xcIn; + /* xb + xd */ + Xbplusd = (_Float16)xbIn + (_Float16)xdIn; + /* ya + yc */ + Yaplusc = (_Float16)yaIn + (_Float16)ycIn; + /* yb + yd */ + Ybplusd = (_Float16)ybIn + (_Float16)ydIn; + + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + + /* xa - xc */ + Xaminusc = (_Float16)xaIn - (_Float16)xcIn; + /* xb - xd */ + Xbminusd = (_Float16)xbIn - (_Float16)xdIn; + /* ya - yc */ + Yaminusc = (_Float16)yaIn - (_Float16)ycIn; + /* yb - yd */ + Ybminusd = (_Float16)ybIn - (_Float16)ydIn; + + /* xa' = xa + xb + xc + xd */ + pSrc[(2U * i0)] = (_Float16)Xaplusc + (_Float16)Xbplusd; + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (_Float16)Yaplusc + (_Float16)Ybplusd; + + /* (xa - xc) + (yb - yd) */ + Xb12C_out = ((_Float16)Xaminusc + (_Float16)Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yb12C_out = ((_Float16)Yaminusc - (_Float16)Xbminusd); + /* (xa + xc) - (xb + xd) */ + Xc12C_out = ((_Float16)Xaplusc - (_Float16)Xbplusd); + /* (ya + yc) - (yb + yd) */ + Yc12C_out = ((_Float16)Yaplusc - (_Float16)Ybplusd); + /* (xa - xc) - (yb - yd) */ + Xd12C_out = ((_Float16)Xaminusc - (_Float16)Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yd12C_out = ((_Float16)Xbminusd + (_Float16)Yaminusc); + + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + + /* index calculation for the coefficients */ + ia3 = ia2 + ia1; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + Xb12_out = (_Float16)Xb12C_out * (_Float16)co1; + Yb12_out = (_Float16)Yb12C_out * (_Float16)co1; + Xc12_out = (_Float16)Xc12C_out * (_Float16)co2; + Yc12_out = (_Float16)Yc12C_out * (_Float16)co2; + Xd12_out = (_Float16)Xd12C_out * (_Float16)co3; + Yd12_out = (_Float16)Yd12C_out * (_Float16)co3; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + //Xb12_out -= Yb12C_out * si1; + p0 = (_Float16)Yb12C_out * (_Float16)si1; + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + //Yb12_out += Xb12C_out * si1; + p1 = (_Float16)Xb12C_out * (_Float16)si1; + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + //Xc12_out -= Yc12C_out * si2; + p2 = (_Float16)Yc12C_out * (_Float16)si2; + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + //Yc12_out += Xc12C_out * si2; + p3 = (_Float16)Xc12C_out * (_Float16)si2; + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + //Xd12_out -= Yd12C_out * si3; + p4 = (_Float16)Yd12C_out * (_Float16)si3; + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + //Yd12_out += Xd12C_out * si3; + p5 = (_Float16)Xd12C_out * (_Float16)si3; + + Xb12_out += (_Float16)p0; + Yb12_out -= (_Float16)p1; + Xc12_out += (_Float16)p2; + Yc12_out -= (_Float16)p3; + Xd12_out += (_Float16)p4; + Yd12_out -= (_Float16)p5; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = Xc12_out; + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = Yc12_out; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = Xb12_out; + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = Yb12_out; + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = Xd12_out; + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = Yd12_out; + + /* Twiddle coefficients index modifier */ + ia1 += twidCoefModifier; + + /* Updating input index */ + i0++; + + } + while (--j); + + twidCoefModifier <<= 2U; + + /* Calculation of second stage to excluding last stage */ + for (k = fftLen >> 2U; k > 4U; k >>= 2U) + { + /* Initializations for the first stage */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* Calculation of first stage */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 += twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + xaIn = pSrc[(2U * i0)]; + yaIn = pSrc[(2U * i0) + 1U]; + + xbIn = pSrc[(2U * i1)]; + ybIn = pSrc[(2U * i1) + 1U]; + + xcIn = pSrc[(2U * i2)]; + ycIn = pSrc[(2U * i2) + 1U]; + + xdIn = pSrc[(2U * i3)]; + ydIn = pSrc[(2U * i3) + 1U]; + + /* xa - xc */ + Xaminusc = (_Float16)xaIn - (_Float16)xcIn; + /* (xb - xd) */ + Xbminusd = (_Float16)xbIn - (_Float16)xdIn; + /* ya - yc */ + Yaminusc = (_Float16)yaIn - (_Float16)ycIn; + /* (yb - yd) */ + Ybminusd = (_Float16)ybIn - (_Float16)ydIn; + + /* xa + xc */ + Xaplusc = (_Float16)xaIn + (_Float16)xcIn; + /* xb + xd */ + Xbplusd = (_Float16)xbIn + (_Float16)xdIn; + /* ya + yc */ + Yaplusc = (_Float16)yaIn + (_Float16)ycIn; + /* yb + yd */ + Ybplusd = (_Float16)ybIn + (_Float16)ydIn; + + /* (xa - xc) + (yb - yd) */ + Xb12C_out = ((_Float16)Xaminusc + (_Float16)Ybminusd); + /* (ya - yc) - (xb - xd) */ + Yb12C_out = ((_Float16)Yaminusc - (_Float16)Xbminusd); + /* xa + xc -(xb + xd) */ + Xc12C_out = ((_Float16)Xaplusc - (_Float16)Xbplusd); + /* (ya + yc) - (yb + yd) */ + Yc12C_out = ((_Float16)Yaplusc - (_Float16)Ybplusd); + /* (xa - xc) - (yb - yd) */ + Xd12C_out = ((_Float16)Xaminusc - (_Float16)Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yd12C_out = ((_Float16)Xbminusd + (_Float16)Yaminusc); + + pSrc[(2U * i0)] = (_Float16)Xaplusc + (_Float16)Xbplusd; + pSrc[(2U * i0) + 1U] = (_Float16)Yaplusc + (_Float16)Ybplusd; + + Xb12_out = (_Float16)Xb12C_out * (_Float16)co1; + Yb12_out = (_Float16)Yb12C_out * (_Float16)co1; + Xc12_out = (_Float16)Xc12C_out * (_Float16)co2; + Yc12_out = (_Float16)Yc12C_out * (_Float16)co2; + Xd12_out = (_Float16)Xd12C_out * (_Float16)co3; + Yd12_out = (_Float16)Yd12C_out * (_Float16)co3; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + //Xb12_out -= Yb12C_out * si1; + p0 = (_Float16)Yb12C_out * (_Float16)si1; + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + //Yb12_out += Xb12C_out * si1; + p1 = (_Float16)Xb12C_out * (_Float16)si1; + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + //Xc12_out -= Yc12C_out * si2; + p2 = (_Float16)Yc12C_out * (_Float16)si2; + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + //Yc12_out += Xc12C_out * si2; + p3 = (_Float16)Xc12C_out * (_Float16)si2; + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + //Xd12_out -= Yd12C_out * si3; + p4 = (_Float16)Yd12C_out * (_Float16)si3; + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + //Yd12_out += Xd12C_out * si3; + p5 = (_Float16)Xd12C_out * (_Float16)si3; + + Xb12_out += (_Float16)p0; + Yb12_out -= (_Float16)p1; + Xc12_out += (_Float16)p2; + Yc12_out -= (_Float16)p3; + Xd12_out += (_Float16)p4; + Yd12_out -= (_Float16)p5; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = Xc12_out; + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = Yc12_out; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = Xb12_out; + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = Yb12_out; + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = Xd12_out; + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = Yd12_out; + + i0 += n1; + } while (i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } + + j = fftLen >> 2; + ptr1 = &pSrc[0]; + + /* Calculations of last stage */ + do + { + xaIn = ptr1[0]; + yaIn = ptr1[1]; + xbIn = ptr1[2]; + ybIn = ptr1[3]; + xcIn = ptr1[4]; + ycIn = ptr1[5]; + xdIn = ptr1[6]; + ydIn = ptr1[7]; + + /* xa + xc */ + Xaplusc = (_Float16)xaIn + (_Float16)xcIn; + + /* xa - xc */ + Xaminusc = (_Float16)xaIn - (_Float16)xcIn; + + /* ya + yc */ + Yaplusc = (_Float16)yaIn + (_Float16)ycIn; + + /* ya - yc */ + Yaminusc = (_Float16)yaIn - (_Float16)ycIn; + + /* xb + xd */ + Xbplusd = (_Float16)xbIn + (_Float16)xdIn; + + /* yb + yd */ + Ybplusd = (_Float16)ybIn + (_Float16)ydIn; + + /* (xb-xd) */ + Xbminusd = (_Float16)xbIn - (_Float16)xdIn; + + /* (yb-yd) */ + Ybminusd = (_Float16)ybIn - (_Float16)ydIn; + + /* xa' = xa + xb + xc + xd */ + a0 = ((_Float16)Xaplusc + (_Float16)Xbplusd); + /* ya' = ya + yb + yc + yd */ + a1 = ((_Float16)Yaplusc + (_Float16)Ybplusd); + /* xc' = (xa-xb+xc-xd) */ + a2 = ((_Float16)Xaplusc - (_Float16)Xbplusd); + /* yc' = (ya-yb+yc-yd) */ + a3 = ((_Float16)Yaplusc - (_Float16)Ybplusd); + /* xb' = (xa+yb-xc-yd) */ + a4 = ((_Float16)Xaminusc + (_Float16)Ybminusd); + /* yb' = (ya-xb-yc+xd) */ + a5 = ((_Float16)Yaminusc - (_Float16)Xbminusd); + /* xd' = (xa-yb-xc+yd)) */ + a6 = ((_Float16)Xaminusc - (_Float16)Ybminusd); + /* yd' = (ya+xb-yc-xd) */ + a7 = ((_Float16)Xbminusd + (_Float16)Yaminusc); + + ptr1[0] = a0; + ptr1[1] = a1; + ptr1[2] = a2; + ptr1[3] = a3; + ptr1[4] = a4; + ptr1[5] = a5; + ptr1[6] = a6; + ptr1[7] = a7; + + /* increment pointer by 8 */ + ptr1 += 8U; + } while (--j); + +#else + + float16_t t1, t2, r1, r2, s1, s2; + + /* Run the below code for Cortex-M0 */ + + /* Initializations for the fft calculation */ + n2 = fftLen; + n1 = n2; + for (k = fftLen; k > 1U; k >>= 2U) + { + /* Initializations for the fft calculation */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* FFT Calculation */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* xa + xc */ + r1 = (_Float16)pSrc[(2U * i0)] + (_Float16)pSrc[(2U * i2)]; + + /* xa - xc */ + r2 = (_Float16)pSrc[(2U * i0)] - (_Float16)pSrc[(2U * i2)]; + + /* ya + yc */ + s1 = (_Float16)pSrc[(2U * i0) + 1U] + (_Float16)pSrc[(2U * i2) + 1U]; + + /* ya - yc */ + s2 = (_Float16)pSrc[(2U * i0) + 1U] - (_Float16)pSrc[(2U * i2) + 1U]; + + /* xb + xd */ + t1 = (_Float16)pSrc[2U * i1] + (_Float16)pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = (_Float16)r1 + (_Float16)t1; + + /* xa + xc -(xb + xd) */ + r1 = (_Float16)r1 - (_Float16)t1; + + /* yb + yd */ + t2 = (_Float16)pSrc[(2U * i1) + 1U] + (_Float16)pSrc[(2U * i3) + 1U]; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (_Float16)s1 + (_Float16)t2; + + /* (ya + yc) - (yb + yd) */ + s1 = (_Float16)s1 - (_Float16)t2; + + /* (yb - yd) */ + t1 = (_Float16)pSrc[(2U * i1) + 1U] - (_Float16)pSrc[(2U * i3) + 1U]; + + /* (xb - xd) */ + t2 = (_Float16)pSrc[2U * i1] - (_Float16)pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = ((_Float16)r1 * (_Float16)co2) + ((_Float16)s1 * (_Float16)si2); + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = ((_Float16)s1 * (_Float16)co2) - ((_Float16)r1 * (_Float16)si2); + + /* (xa - xc) + (yb - yd) */ + r1 = (_Float16)r2 + (_Float16)t1; + + /* (xa - xc) - (yb - yd) */ + r2 = (_Float16)r2 - (_Float16)t1; + + /* (ya - yc) - (xb - xd) */ + s1 = (_Float16)s2 - (_Float16)t2; + + /* (ya - yc) + (xb - xd) */ + s2 = (_Float16)s2 + (_Float16)t2; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = ((_Float16)r1 * (_Float16)co1) + ((_Float16)s1 * (_Float16)si1); + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = ((_Float16)s1 * (_Float16)co1) - ((_Float16)r1 * (_Float16)si1); + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = ((_Float16)r2 * (_Float16)co3) + ((_Float16)s2 * (_Float16)si3); + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = ((_Float16)s2 * (_Float16)co3) - ((_Float16)r2 * (_Float16)si3); + + i0 += n1; + } while ( i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + +/* +* @brief Core function for the floating-point CIFFT butterfly process. +* @param[in, out] *pSrc points to the in-place buffer of floating-point data type. +* @param[in] fftLen length of the FFT. +* @param[in] *pCoef points to twiddle coefficient buffer. +* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. +* @param[in] onebyfftLen value of 1/fftLen. +* @return none. +*/ + +void arm_radix4_butterfly_inverse_f16( +float16_t * pSrc, +uint16_t fftLen, +const float16_t * pCoef, +uint16_t twidCoefModifier, +float16_t onebyfftLen) +{ + float16_t co1, co2, co3, si1, si2, si3; + uint32_t ia1, ia2, ia3; + uint32_t i0, i1, i2, i3; + uint32_t n1, n2, j, k; + +#if defined (ARM_MATH_DSP) + + float16_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn; + float16_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc, + Ybminusd; + float16_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out; + float16_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out; + float16_t *ptr1; + float16_t p0,p1,p2,p3,p4,p5,p6,p7; + float16_t a0,a1,a2,a3,a4,a5,a6,a7; + + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* n2 = fftLen/4 */ + n2 >>= 2U; + i0 = 0U; + ia1 = 0U; + + j = n2; + + /* Calculation of first stage */ + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Butterfly implementation */ + xaIn = pSrc[(2U * i0)]; + yaIn = pSrc[(2U * i0) + 1U]; + + xcIn = pSrc[(2U * i2)]; + ycIn = pSrc[(2U * i2) + 1U]; + + xbIn = pSrc[(2U * i1)]; + ybIn = pSrc[(2U * i1) + 1U]; + + xdIn = pSrc[(2U * i3)]; + ydIn = pSrc[(2U * i3) + 1U]; + + /* xa + xc */ + Xaplusc = (_Float16)xaIn + (_Float16)xcIn; + /* xb + xd */ + Xbplusd = (_Float16)xbIn + (_Float16)xdIn; + /* ya + yc */ + Yaplusc = (_Float16)yaIn + (_Float16)ycIn; + /* yb + yd */ + Ybplusd = (_Float16)ybIn + (_Float16)ydIn; + + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + + /* xa - xc */ + Xaminusc = (_Float16)xaIn - (_Float16)xcIn; + /* xb - xd */ + Xbminusd = (_Float16)xbIn - (_Float16)xdIn; + /* ya - yc */ + Yaminusc = (_Float16)yaIn - (_Float16)ycIn; + /* yb - yd */ + Ybminusd = (_Float16)ybIn - (_Float16)ydIn; + + /* xa' = xa + xb + xc + xd */ + pSrc[(2U * i0)] = (_Float16)Xaplusc + (_Float16)Xbplusd; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (_Float16)Yaplusc + (_Float16)Ybplusd; + + /* (xa - xc) - (yb - yd) */ + Xb12C_out = ((_Float16)Xaminusc - (_Float16)Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yb12C_out = ((_Float16)Yaminusc + (_Float16)Xbminusd); + /* (xa + xc) - (xb + xd) */ + Xc12C_out = ((_Float16)Xaplusc - (_Float16)Xbplusd); + /* (ya + yc) - (yb + yd) */ + Yc12C_out = ((_Float16)Yaplusc - (_Float16)Ybplusd); + /* (xa - xc) + (yb - yd) */ + Xd12C_out = ((_Float16)Xaminusc + (_Float16)Ybminusd); + /* (ya - yc) - (xb - xd) */ + Yd12C_out = ((_Float16)Yaminusc - (_Float16)Xbminusd); + + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + + /* index calculation for the coefficients */ + ia3 = ia2 + ia1; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + Xb12_out = (_Float16)Xb12C_out * (_Float16)co1; + Yb12_out = (_Float16)Yb12C_out * (_Float16)co1; + Xc12_out = (_Float16)Xc12C_out * (_Float16)co2; + Yc12_out = (_Float16)Yc12C_out * (_Float16)co2; + Xd12_out = (_Float16)Xd12C_out * (_Float16)co3; + Yd12_out = (_Float16)Yd12C_out * (_Float16)co3; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + //Xb12_out -= Yb12C_out * si1; + p0 = (_Float16)Yb12C_out * (_Float16)si1; + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + //Yb12_out += Xb12C_out * si1; + p1 = (_Float16)Xb12C_out * (_Float16)si1; + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + //Xc12_out -= Yc12C_out * si2; + p2 = (_Float16)Yc12C_out * (_Float16)si2; + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + //Yc12_out += Xc12C_out * si2; + p3 = (_Float16)Xc12C_out * (_Float16)si2; + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + //Xd12_out -= Yd12C_out * si3; + p4 = (_Float16)Yd12C_out * (_Float16)si3; + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + //Yd12_out += Xd12C_out * si3; + p5 =(_Float16) Xd12C_out * (_Float16)si3; + + Xb12_out -= (_Float16)p0; + Yb12_out += (_Float16)p1; + Xc12_out -= (_Float16)p2; + Yc12_out += (_Float16)p3; + Xd12_out -= (_Float16)p4; + Yd12_out += (_Float16)p5; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = Xc12_out; + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = Yc12_out; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = Xb12_out; + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = Yb12_out; + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = Xd12_out; + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = Yd12_out; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + /* Updating input index */ + i0 = i0 + 1U; + + } while (--j); + + twidCoefModifier <<= 2U; + + /* Calculation of second stage to excluding last stage */ + for (k = fftLen >> 2U; k > 4U; k >>= 2U) + { + /* Initializations for the first stage */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* Calculation of first stage */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + xaIn = pSrc[(2U * i0)]; + yaIn = pSrc[(2U * i0) + 1U]; + + xbIn = pSrc[(2U * i1)]; + ybIn = pSrc[(2U * i1) + 1U]; + + xcIn = pSrc[(2U * i2)]; + ycIn = pSrc[(2U * i2) + 1U]; + + xdIn = pSrc[(2U * i3)]; + ydIn = pSrc[(2U * i3) + 1U]; + + /* xa - xc */ + Xaminusc = (_Float16)xaIn - (_Float16)xcIn; + /* (xb - xd) */ + Xbminusd = (_Float16)xbIn - (_Float16)xdIn; + /* ya - yc */ + Yaminusc = (_Float16)yaIn - (_Float16)ycIn; + /* (yb - yd) */ + Ybminusd = (_Float16)ybIn - (_Float16)ydIn; + + /* xa + xc */ + Xaplusc = (_Float16)xaIn + (_Float16)xcIn; + /* xb + xd */ + Xbplusd = (_Float16)xbIn + (_Float16)xdIn; + /* ya + yc */ + Yaplusc = (_Float16)yaIn + (_Float16)ycIn; + /* yb + yd */ + Ybplusd = (_Float16)ybIn + (_Float16)ydIn; + + /* (xa - xc) - (yb - yd) */ + Xb12C_out = ((_Float16)Xaminusc - (_Float16)Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yb12C_out = ((_Float16)Yaminusc + (_Float16)Xbminusd); + /* xa + xc -(xb + xd) */ + Xc12C_out = ((_Float16)Xaplusc - (_Float16)Xbplusd); + /* (ya + yc) - (yb + yd) */ + Yc12C_out = ((_Float16)Yaplusc - (_Float16)Ybplusd); + /* (xa - xc) + (yb - yd) */ + Xd12C_out = ((_Float16)Xaminusc + (_Float16)Ybminusd); + /* (ya - yc) - (xb - xd) */ + Yd12C_out = ((_Float16)Yaminusc - (_Float16)Xbminusd); + + pSrc[(2U * i0)] = (_Float16)Xaplusc + (_Float16)Xbplusd; + pSrc[(2U * i0) + 1U] = (_Float16)Yaplusc + (_Float16)Ybplusd; + + Xb12_out = (_Float16)Xb12C_out * (_Float16)co1; + Yb12_out = (_Float16)Yb12C_out * (_Float16)co1; + Xc12_out = (_Float16)Xc12C_out * (_Float16)co2; + Yc12_out = (_Float16)Yc12C_out * (_Float16)co2; + Xd12_out = (_Float16)Xd12C_out * (_Float16)co3; + Yd12_out = (_Float16)Yd12C_out * (_Float16)co3; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + //Xb12_out -= Yb12C_out * si1; + p0 = (_Float16)Yb12C_out * (_Float16)si1; + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + //Yb12_out += Xb12C_out * si1; + p1 = (_Float16)Xb12C_out * (_Float16)si1; + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + //Xc12_out -= Yc12C_out * si2; + p2 = (_Float16)Yc12C_out * (_Float16)si2; + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + //Yc12_out += Xc12C_out * si2; + p3 = (_Float16)Xc12C_out * (_Float16)si2; + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + //Xd12_out -= Yd12C_out * si3; + p4 = (_Float16)Yd12C_out * (_Float16)si3; + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + //Yd12_out += Xd12C_out * si3; + p5 = (_Float16)Xd12C_out * (_Float16)si3; + + Xb12_out -= (_Float16)p0; + Yb12_out += (_Float16)p1; + Xc12_out -= (_Float16)p2; + Yc12_out += (_Float16)p3; + Xd12_out -= (_Float16)p4; + Yd12_out += (_Float16)p5; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = Xc12_out; + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = Yc12_out; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = Xb12_out; + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = Yb12_out; + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = Xd12_out; + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = Yd12_out; + + i0 += n1; + } while (i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } + /* Initializations of last stage */ + + j = fftLen >> 2; + ptr1 = &pSrc[0]; + + /* Calculations of last stage */ + do + { + xaIn = ptr1[0]; + yaIn = ptr1[1]; + xbIn = ptr1[2]; + ybIn = ptr1[3]; + xcIn = ptr1[4]; + ycIn = ptr1[5]; + xdIn = ptr1[6]; + ydIn = ptr1[7]; + + /* Butterfly implementation */ + /* xa + xc */ + Xaplusc = (_Float16)xaIn + (_Float16)xcIn; + + /* xa - xc */ + Xaminusc = (_Float16)xaIn - (_Float16)xcIn; + + /* ya + yc */ + Yaplusc = (_Float16)yaIn + (_Float16)ycIn; + + /* ya - yc */ + Yaminusc = (_Float16)yaIn - (_Float16)ycIn; + + /* xb + xd */ + Xbplusd = (_Float16)xbIn + (_Float16)xdIn; + + /* yb + yd */ + Ybplusd = (_Float16)ybIn + (_Float16)ydIn; + + /* (xb-xd) */ + Xbminusd = (_Float16)xbIn - (_Float16)xdIn; + + /* (yb-yd) */ + Ybminusd = (_Float16)ybIn - (_Float16)ydIn; + + /* xa' = (xa+xb+xc+xd) * onebyfftLen */ + a0 = ((_Float16)Xaplusc + (_Float16)Xbplusd); + /* ya' = (ya+yb+yc+yd) * onebyfftLen */ + a1 = ((_Float16)Yaplusc + (_Float16)Ybplusd); + /* xc' = (xa-xb+xc-xd) * onebyfftLen */ + a2 = ((_Float16)Xaplusc - (_Float16)Xbplusd); + /* yc' = (ya-yb+yc-yd) * onebyfftLen */ + a3 = ((_Float16)Yaplusc - (_Float16)Ybplusd); + /* xb' = (xa-yb-xc+yd) * onebyfftLen */ + a4 = ((_Float16)Xaminusc - (_Float16)Ybminusd); + /* yb' = (ya+xb-yc-xd) * onebyfftLen */ + a5 = ((_Float16)Yaminusc + (_Float16)Xbminusd); + /* xd' = (xa-yb-xc+yd) * onebyfftLen */ + a6 = ((_Float16)Xaminusc + (_Float16)Ybminusd); + /* yd' = (ya-xb-yc+xd) * onebyfftLen */ + a7 = ((_Float16)Yaminusc - (_Float16)Xbminusd); + + p0 = (_Float16)a0 * (_Float16)onebyfftLen; + p1 = (_Float16)a1 * (_Float16)onebyfftLen; + p2 = (_Float16)a2 * (_Float16)onebyfftLen; + p3 = (_Float16)a3 * (_Float16)onebyfftLen; + p4 = (_Float16)a4 * (_Float16)onebyfftLen; + p5 = (_Float16)a5 * (_Float16)onebyfftLen; + p6 = (_Float16)a6 * (_Float16)onebyfftLen; + p7 = (_Float16)a7 * (_Float16)onebyfftLen; + + /* xa' = (xa+xb+xc+xd) * onebyfftLen */ + ptr1[0] = p0; + /* ya' = (ya+yb+yc+yd) * onebyfftLen */ + ptr1[1] = p1; + /* xc' = (xa-xb+xc-xd) * onebyfftLen */ + ptr1[2] = p2; + /* yc' = (ya-yb+yc-yd) * onebyfftLen */ + ptr1[3] = p3; + /* xb' = (xa-yb-xc+yd) * onebyfftLen */ + ptr1[4] = p4; + /* yb' = (ya+xb-yc-xd) * onebyfftLen */ + ptr1[5] = p5; + /* xd' = (xa-yb-xc+yd) * onebyfftLen */ + ptr1[6] = p6; + /* yd' = (ya-xb-yc+xd) * onebyfftLen */ + ptr1[7] = p7; + + /* increment source pointer by 8 for next calculations */ + ptr1 = ptr1 + 8U; + + } while (--j); + +#else + + float16_t t1, t2, r1, r2, s1, s2; + + /* Run the below code for Cortex-M0 */ + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* Calculation of first stage */ + for (k = fftLen; k > 4U; k >>= 2U) + { + /* Initializations for the first stage */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* Calculation of first stage */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* xa + xc */ + r1 = (_Float16)pSrc[(2U * i0)] + (_Float16)pSrc[(2U * i2)]; + + /* xa - xc */ + r2 = (_Float16)pSrc[(2U * i0)] - (_Float16)pSrc[(2U * i2)]; + + /* ya + yc */ + s1 = (_Float16)pSrc[(2U * i0) + 1U] + (_Float16)pSrc[(2U * i2) + 1U]; + + /* ya - yc */ + s2 = (_Float16)pSrc[(2U * i0) + 1U] - (_Float16)pSrc[(2U * i2) + 1U]; + + /* xb + xd */ + t1 = (_Float16)pSrc[2U * i1] + (_Float16)pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = (_Float16)r1 + (_Float16)t1; + + /* xa + xc -(xb + xd) */ + r1 = (_Float16)r1 - (_Float16)t1; + + /* yb + yd */ + t2 = (_Float16)pSrc[(2U * i1) + 1U] + (_Float16)pSrc[(2U * i3) + 1U]; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (_Float16)s1 + (_Float16)t2; + + /* (ya + yc) - (yb + yd) */ + s1 = (_Float16)s1 - (_Float16)t2; + + /* (yb - yd) */ + t1 = (_Float16)pSrc[(2U * i1) + 1U] - (_Float16)pSrc[(2U * i3) + 1U]; + + /* (xb - xd) */ + t2 = (_Float16)pSrc[2U * i1] - (_Float16)pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = ((_Float16)r1 * (_Float16)co2) - ((_Float16)s1 * (_Float16)si2); + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = ((_Float16)s1 * (_Float16)co2) + ((_Float16)r1 * (_Float16)si2); + + /* (xa - xc) - (yb - yd) */ + r1 = (_Float16)r2 - (_Float16)t1; + + /* (xa - xc) + (yb - yd) */ + r2 = (_Float16)r2 + (_Float16)t1; + + /* (ya - yc) + (xb - xd) */ + s1 = (_Float16)s2 + (_Float16)t2; + + /* (ya - yc) - (xb - xd) */ + s2 = (_Float16)s2 - (_Float16)t2; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = ((_Float16)r1 * (_Float16)co1) - ((_Float16)s1 * (_Float16)si1); + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = ((_Float16)s1 * (_Float16)co1) + ((_Float16)r1 * (_Float16)si1); + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = ((_Float16)r2 * (_Float16)co3) - ((_Float16)s2 * (_Float16)si3); + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = ((_Float16)s2 * (_Float16)co3) + ((_Float16)r2 * (_Float16)si3); + + i0 += n1; + } while ( i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } + /* Initializations of last stage */ + n1 = n2; + n2 >>= 2U; + + /* Calculations of last stage */ + for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1) + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Butterfly implementation */ + /* xa + xc */ + r1 = (_Float16)pSrc[2U * i0] + (_Float16)pSrc[2U * i2]; + + /* xa - xc */ + r2 = (_Float16)pSrc[2U * i0] - (_Float16)pSrc[2U * i2]; + + /* ya + yc */ + s1 = (_Float16)pSrc[(2U * i0) + 1U] + (_Float16)pSrc[(2U * i2) + 1U]; + + /* ya - yc */ + s2 = (_Float16)pSrc[(2U * i0) + 1U] - (_Float16)pSrc[(2U * i2) + 1U]; + + /* xc + xd */ + t1 = (_Float16)pSrc[2U * i1] + (_Float16)pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = ((_Float16)r1 + (_Float16)t1) * (_Float16)onebyfftLen; + + /* (xa + xb) - (xc + xd) */ + r1 = (_Float16)r1 - (_Float16)t1; + + /* yb + yd */ + t2 = (_Float16)pSrc[(2U * i1) + 1U] + (_Float16)pSrc[(2U * i3) + 1U]; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = ((_Float16)s1 + (_Float16)t2) * (_Float16)onebyfftLen; + + /* (ya + yc) - (yb + yd) */ + s1 = (_Float16)s1 - (_Float16)t2; + + /* (yb-yd) */ + t1 = (_Float16)pSrc[(2U * i1) + 1U] - (_Float16)pSrc[(2U * i3) + 1U]; + + /* (xb-xd) */ + t2 = (_Float16)pSrc[2U * i1] - (_Float16)pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = (_Float16)r1 * (_Float16)onebyfftLen; + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = (_Float16)s1 * (_Float16)onebyfftLen; + + /* (xa - xc) - (yb-yd) */ + r1 = (_Float16)r2 - (_Float16)t1; + + /* (xa - xc) + (yb-yd) */ + r2 = (_Float16)r2 + (_Float16)t1; + + /* (ya - yc) + (xb-xd) */ + s1 = (_Float16)s2 + (_Float16)t2; + + /* (ya - yc) - (xb-xd) */ + s2 = (_Float16)s2 - (_Float16)t2; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = (_Float16)r1 * (_Float16)onebyfftLen; + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = (_Float16)s1 * (_Float16)onebyfftLen; + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = (_Float16)r2 * (_Float16)onebyfftLen; + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = (_Float16)s2 * (_Float16)onebyfftLen; + } + +#endif /* #if defined (ARM_MATH_DSP) */ +} + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c new file mode 100644 index 0000000..9d9d4d5 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c @@ -0,0 +1,1203 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix4_f32.c + * Description: Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +extern void arm_bitreversal_f32( + float32_t * pSrc, + uint16_t fftSize, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +void arm_radix4_butterfly_f32( + float32_t * pSrc, + uint16_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier); + +void arm_radix4_butterfly_inverse_f32( + float32_t * pSrc, + uint16_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier, + float32_t onebyfftLen); + + + + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + + +/** + @brief Processing function for the floating-point Radix-4 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future. + @param[in] S points to an instance of the floating-point Radix-4 CFFT/CIFFT structure + @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @return none + */ + +void arm_cfft_radix4_f32( + const arm_cfft_radix4_instance_f32 * S, + float32_t * pSrc) +{ + if (S->ifftFlag == 1U) + { + /* Complex IFFT radix-4 */ + arm_radix4_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier, S->onebyfftLen); + } + else + { + /* Complex FFT radix-4 */ + arm_radix4_butterfly_f32(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier); + } + + if (S->bitReverseFlag == 1U) + { + /* Bit Reversal */ + arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable); + } + +} + +/** + @} end of ComplexFFT group + */ + +/* ---------------------------------------------------------------------- + * Internal helper function used by the FFTs + * ---------------------------------------------------------------------- */ + +/** + brief Core function for the floating-point CFFT butterfly process. + param[in,out] pSrc points to the in-place buffer of floating-point data type + param[in] fftLen length of the FFT + param[in] pCoef points to the twiddle coefficient buffer + param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + return none + */ + +void arm_radix4_butterfly_f32( + float32_t * pSrc, + uint16_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier) +{ + float32_t co1, co2, co3, si1, si2, si3; + uint32_t ia1, ia2, ia3; + uint32_t i0, i1, i2, i3; + uint32_t n1, n2, j, k; + +#if defined (ARM_MATH_LOOPUNROLL) + + float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn; + float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc, + Ybminusd; + float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out; + float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out; + float32_t *ptr1; + float32_t p0,p1,p2,p3,p4,p5; + float32_t a0,a1,a2,a3,a4,a5,a6,a7; + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* n2 = fftLen/4 */ + n2 >>= 2U; + i0 = 0U; + ia1 = 0U; + + j = n2; + + /* Calculation of first stage */ + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + xaIn = pSrc[(2U * i0)]; + yaIn = pSrc[(2U * i0) + 1U]; + + xbIn = pSrc[(2U * i1)]; + ybIn = pSrc[(2U * i1) + 1U]; + + xcIn = pSrc[(2U * i2)]; + ycIn = pSrc[(2U * i2) + 1U]; + + xdIn = pSrc[(2U * i3)]; + ydIn = pSrc[(2U * i3) + 1U]; + + /* xa + xc */ + Xaplusc = xaIn + xcIn; + /* xb + xd */ + Xbplusd = xbIn + xdIn; + /* ya + yc */ + Yaplusc = yaIn + ycIn; + /* yb + yd */ + Ybplusd = ybIn + ydIn; + + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + + /* xa - xc */ + Xaminusc = xaIn - xcIn; + /* xb - xd */ + Xbminusd = xbIn - xdIn; + /* ya - yc */ + Yaminusc = yaIn - ycIn; + /* yb - yd */ + Ybminusd = ybIn - ydIn; + + /* xa' = xa + xb + xc + xd */ + pSrc[(2U * i0)] = Xaplusc + Xbplusd; + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd; + + /* (xa - xc) + (yb - yd) */ + Xb12C_out = (Xaminusc + Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yb12C_out = (Yaminusc - Xbminusd); + /* (xa + xc) - (xb + xd) */ + Xc12C_out = (Xaplusc - Xbplusd); + /* (ya + yc) - (yb + yd) */ + Yc12C_out = (Yaplusc - Ybplusd); + /* (xa - xc) - (yb - yd) */ + Xd12C_out = (Xaminusc - Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yd12C_out = (Xbminusd + Yaminusc); + + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + + /* index calculation for the coefficients */ + ia3 = ia2 + ia1; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + Xb12_out = Xb12C_out * co1; + Yb12_out = Yb12C_out * co1; + Xc12_out = Xc12C_out * co2; + Yc12_out = Yc12C_out * co2; + Xd12_out = Xd12C_out * co3; + Yd12_out = Yd12C_out * co3; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + //Xb12_out -= Yb12C_out * si1; + p0 = Yb12C_out * si1; + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + //Yb12_out += Xb12C_out * si1; + p1 = Xb12C_out * si1; + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + //Xc12_out -= Yc12C_out * si2; + p2 = Yc12C_out * si2; + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + //Yc12_out += Xc12C_out * si2; + p3 = Xc12C_out * si2; + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + //Xd12_out -= Yd12C_out * si3; + p4 = Yd12C_out * si3; + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + //Yd12_out += Xd12C_out * si3; + p5 = Xd12C_out * si3; + + Xb12_out += p0; + Yb12_out -= p1; + Xc12_out += p2; + Yc12_out -= p3; + Xd12_out += p4; + Yd12_out -= p5; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = Xc12_out; + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = Yc12_out; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = Xb12_out; + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = Yb12_out; + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = Xd12_out; + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = Yd12_out; + + /* Twiddle coefficients index modifier */ + ia1 += twidCoefModifier; + + /* Updating input index */ + i0++; + + } + while (--j); + + twidCoefModifier <<= 2U; + + /* Calculation of second stage to excluding last stage */ + for (k = fftLen >> 2U; k > 4U; k >>= 2U) + { + /* Initializations for the first stage */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* Calculation of first stage */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[(ia1 * 2U)]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[(ia2 * 2U)]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[(ia3 * 2U)]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 += twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + xaIn = pSrc[(2U * i0)]; + yaIn = pSrc[(2U * i0) + 1U]; + + xbIn = pSrc[(2U * i1)]; + ybIn = pSrc[(2U * i1) + 1U]; + + xcIn = pSrc[(2U * i2)]; + ycIn = pSrc[(2U * i2) + 1U]; + + xdIn = pSrc[(2U * i3)]; + ydIn = pSrc[(2U * i3) + 1U]; + + /* xa - xc */ + Xaminusc = xaIn - xcIn; + /* (xb - xd) */ + Xbminusd = xbIn - xdIn; + /* ya - yc */ + Yaminusc = yaIn - ycIn; + /* (yb - yd) */ + Ybminusd = ybIn - ydIn; + + /* xa + xc */ + Xaplusc = xaIn + xcIn; + /* xb + xd */ + Xbplusd = xbIn + xdIn; + /* ya + yc */ + Yaplusc = yaIn + ycIn; + /* yb + yd */ + Ybplusd = ybIn + ydIn; + + /* (xa - xc) + (yb - yd) */ + Xb12C_out = (Xaminusc + Ybminusd); + /* (ya - yc) - (xb - xd) */ + Yb12C_out = (Yaminusc - Xbminusd); + /* xa + xc -(xb + xd) */ + Xc12C_out = (Xaplusc - Xbplusd); + /* (ya + yc) - (yb + yd) */ + Yc12C_out = (Yaplusc - Ybplusd); + /* (xa - xc) - (yb - yd) */ + Xd12C_out = (Xaminusc - Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yd12C_out = (Xbminusd + Yaminusc); + + pSrc[(2U * i0)] = Xaplusc + Xbplusd; + pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd; + + Xb12_out = Xb12C_out * co1; + Yb12_out = Yb12C_out * co1; + Xc12_out = Xc12C_out * co2; + Yc12_out = Yc12C_out * co2; + Xd12_out = Xd12C_out * co3; + Yd12_out = Yd12C_out * co3; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + //Xb12_out -= Yb12C_out * si1; + p0 = Yb12C_out * si1; + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + //Yb12_out += Xb12C_out * si1; + p1 = Xb12C_out * si1; + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + //Xc12_out -= Yc12C_out * si2; + p2 = Yc12C_out * si2; + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + //Yc12_out += Xc12C_out * si2; + p3 = Xc12C_out * si2; + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + //Xd12_out -= Yd12C_out * si3; + p4 = Yd12C_out * si3; + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + //Yd12_out += Xd12C_out * si3; + p5 = Xd12C_out * si3; + + Xb12_out += p0; + Yb12_out -= p1; + Xc12_out += p2; + Yc12_out -= p3; + Xd12_out += p4; + Yd12_out -= p5; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = Xc12_out; + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = Yc12_out; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = Xb12_out; + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = Yb12_out; + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = Xd12_out; + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = Yd12_out; + + i0 += n1; + } while (i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } + + j = fftLen >> 2; + ptr1 = &pSrc[0]; + + /* Calculations of last stage */ + do + { + xaIn = ptr1[0]; + yaIn = ptr1[1]; + xbIn = ptr1[2]; + ybIn = ptr1[3]; + xcIn = ptr1[4]; + ycIn = ptr1[5]; + xdIn = ptr1[6]; + ydIn = ptr1[7]; + + /* xa + xc */ + Xaplusc = xaIn + xcIn; + + /* xa - xc */ + Xaminusc = xaIn - xcIn; + + /* ya + yc */ + Yaplusc = yaIn + ycIn; + + /* ya - yc */ + Yaminusc = yaIn - ycIn; + + /* xb + xd */ + Xbplusd = xbIn + xdIn; + + /* yb + yd */ + Ybplusd = ybIn + ydIn; + + /* (xb-xd) */ + Xbminusd = xbIn - xdIn; + + /* (yb-yd) */ + Ybminusd = ybIn - ydIn; + + /* xa' = xa + xb + xc + xd */ + a0 = (Xaplusc + Xbplusd); + /* ya' = ya + yb + yc + yd */ + a1 = (Yaplusc + Ybplusd); + /* xc' = (xa-xb+xc-xd) */ + a2 = (Xaplusc - Xbplusd); + /* yc' = (ya-yb+yc-yd) */ + a3 = (Yaplusc - Ybplusd); + /* xb' = (xa+yb-xc-yd) */ + a4 = (Xaminusc + Ybminusd); + /* yb' = (ya-xb-yc+xd) */ + a5 = (Yaminusc - Xbminusd); + /* xd' = (xa-yb-xc+yd)) */ + a6 = (Xaminusc - Ybminusd); + /* yd' = (ya+xb-yc-xd) */ + a7 = (Xbminusd + Yaminusc); + + ptr1[0] = a0; + ptr1[1] = a1; + ptr1[2] = a2; + ptr1[3] = a3; + ptr1[4] = a4; + ptr1[5] = a5; + ptr1[6] = a6; + ptr1[7] = a7; + + /* increment pointer by 8 */ + ptr1 += 8U; + } while (--j); + +#else + + float32_t t1, t2, r1, r2, s1, s2; + + /* Initializations for the fft calculation */ + n2 = fftLen; + n1 = n2; + for (k = fftLen; k > 1U; k >>= 2U) + { + /* Initializations for the fft calculation */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* FFT Calculation */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* xa + xc */ + r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)]; + + /* xa - xc */ + r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)]; + + /* ya + yc */ + s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U]; + + /* ya - yc */ + s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U]; + + /* xb + xd */ + t1 = pSrc[2U * i1] + pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = r1 + t1; + + /* xa + xc -(xb + xd) */ + r1 = r1 - t1; + + /* yb + yd */ + t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U]; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = s1 + t2; + + /* (ya + yc) - (yb + yd) */ + s1 = s1 - t2; + + /* (yb - yd) */ + t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U]; + + /* (xb - xd) */ + t2 = pSrc[2U * i1] - pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = (r1 * co2) + (s1 * si2); + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = (s1 * co2) - (r1 * si2); + + /* (xa - xc) + (yb - yd) */ + r1 = r2 + t1; + + /* (xa - xc) - (yb - yd) */ + r2 = r2 - t1; + + /* (ya - yc) - (xb - xd) */ + s1 = s2 - t2; + + /* (ya - yc) + (xb - xd) */ + s2 = s2 + t2; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = (r1 * co1) + (s1 * si1); + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = (s1 * co1) - (r1 * si1); + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = (r2 * co3) + (s2 * si3); + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = (s2 * co3) - (r2 * si3); + + i0 += n1; + } while ( i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + +} + +/** + brief Core function for the floating-point CIFFT butterfly process. + param[in,out] pSrc points to the in-place buffer of floating-point data type + param[in] fftLen length of the FFT + param[in] pCoef points to twiddle coefficient buffer + param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. + param[in] onebyfftLen value of 1/fftLen + return none + */ + +void arm_radix4_butterfly_inverse_f32( + float32_t * pSrc, + uint16_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier, + float32_t onebyfftLen) +{ + float32_t co1, co2, co3, si1, si2, si3; + uint32_t ia1, ia2, ia3; + uint32_t i0, i1, i2, i3; + uint32_t n1, n2, j, k; + +#if defined (ARM_MATH_LOOPUNROLL) + + float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn; + float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc, + Ybminusd; + float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out; + float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out; + float32_t *ptr1; + float32_t p0,p1,p2,p3,p4,p5,p6,p7; + float32_t a0,a1,a2,a3,a4,a5,a6,a7; + + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* n2 = fftLen/4 */ + n2 >>= 2U; + i0 = 0U; + ia1 = 0U; + + j = n2; + + /* Calculation of first stage */ + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Butterfly implementation */ + xaIn = pSrc[(2U * i0)]; + yaIn = pSrc[(2U * i0) + 1U]; + + xcIn = pSrc[(2U * i2)]; + ycIn = pSrc[(2U * i2) + 1U]; + + xbIn = pSrc[(2U * i1)]; + ybIn = pSrc[(2U * i1) + 1U]; + + xdIn = pSrc[(2U * i3)]; + ydIn = pSrc[(2U * i3) + 1U]; + + /* xa + xc */ + Xaplusc = xaIn + xcIn; + /* xb + xd */ + Xbplusd = xbIn + xdIn; + /* ya + yc */ + Yaplusc = yaIn + ycIn; + /* yb + yd */ + Ybplusd = ybIn + ydIn; + + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + + /* xa - xc */ + Xaminusc = xaIn - xcIn; + /* xb - xd */ + Xbminusd = xbIn - xdIn; + /* ya - yc */ + Yaminusc = yaIn - ycIn; + /* yb - yd */ + Ybminusd = ybIn - ydIn; + + /* xa' = xa + xb + xc + xd */ + pSrc[(2U * i0)] = Xaplusc + Xbplusd; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd; + + /* (xa - xc) - (yb - yd) */ + Xb12C_out = (Xaminusc - Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yb12C_out = (Yaminusc + Xbminusd); + /* (xa + xc) - (xb + xd) */ + Xc12C_out = (Xaplusc - Xbplusd); + /* (ya + yc) - (yb + yd) */ + Yc12C_out = (Yaplusc - Ybplusd); + /* (xa - xc) + (yb - yd) */ + Xd12C_out = (Xaminusc + Ybminusd); + /* (ya - yc) - (xb - xd) */ + Yd12C_out = (Yaminusc - Xbminusd); + + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + + /* index calculation for the coefficients */ + ia3 = ia2 + ia1; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + Xb12_out = Xb12C_out * co1; + Yb12_out = Yb12C_out * co1; + Xc12_out = Xc12C_out * co2; + Yc12_out = Yc12C_out * co2; + Xd12_out = Xd12C_out * co3; + Yd12_out = Yd12C_out * co3; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + //Xb12_out -= Yb12C_out * si1; + p0 = Yb12C_out * si1; + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + //Yb12_out += Xb12C_out * si1; + p1 = Xb12C_out * si1; + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + //Xc12_out -= Yc12C_out * si2; + p2 = Yc12C_out * si2; + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + //Yc12_out += Xc12C_out * si2; + p3 = Xc12C_out * si2; + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + //Xd12_out -= Yd12C_out * si3; + p4 = Yd12C_out * si3; + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + //Yd12_out += Xd12C_out * si3; + p5 = Xd12C_out * si3; + + Xb12_out -= p0; + Yb12_out += p1; + Xc12_out -= p2; + Yc12_out += p3; + Xd12_out -= p4; + Yd12_out += p5; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = Xc12_out; + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = Yc12_out; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = Xb12_out; + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = Yb12_out; + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = Xd12_out; + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = Yd12_out; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + /* Updating input index */ + i0 = i0 + 1U; + + } while (--j); + + twidCoefModifier <<= 2U; + + /* Calculation of second stage to excluding last stage */ + for (k = fftLen >> 2U; k > 4U; k >>= 2U) + { + /* Initializations for the first stage */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* Calculation of first stage */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + xaIn = pSrc[(2U * i0)]; + yaIn = pSrc[(2U * i0) + 1U]; + + xbIn = pSrc[(2U * i1)]; + ybIn = pSrc[(2U * i1) + 1U]; + + xcIn = pSrc[(2U * i2)]; + ycIn = pSrc[(2U * i2) + 1U]; + + xdIn = pSrc[(2U * i3)]; + ydIn = pSrc[(2U * i3) + 1U]; + + /* xa - xc */ + Xaminusc = xaIn - xcIn; + /* (xb - xd) */ + Xbminusd = xbIn - xdIn; + /* ya - yc */ + Yaminusc = yaIn - ycIn; + /* (yb - yd) */ + Ybminusd = ybIn - ydIn; + + /* xa + xc */ + Xaplusc = xaIn + xcIn; + /* xb + xd */ + Xbplusd = xbIn + xdIn; + /* ya + yc */ + Yaplusc = yaIn + ycIn; + /* yb + yd */ + Ybplusd = ybIn + ydIn; + + /* (xa - xc) - (yb - yd) */ + Xb12C_out = (Xaminusc - Ybminusd); + /* (ya - yc) + (xb - xd) */ + Yb12C_out = (Yaminusc + Xbminusd); + /* xa + xc -(xb + xd) */ + Xc12C_out = (Xaplusc - Xbplusd); + /* (ya + yc) - (yb + yd) */ + Yc12C_out = (Yaplusc - Ybplusd); + /* (xa - xc) + (yb - yd) */ + Xd12C_out = (Xaminusc + Ybminusd); + /* (ya - yc) - (xb - xd) */ + Yd12C_out = (Yaminusc - Xbminusd); + + pSrc[(2U * i0)] = Xaplusc + Xbplusd; + pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd; + + Xb12_out = Xb12C_out * co1; + Yb12_out = Yb12C_out * co1; + Xc12_out = Xc12C_out * co2; + Yc12_out = Yc12C_out * co2; + Xd12_out = Xd12C_out * co3; + Yd12_out = Yd12C_out * co3; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + //Xb12_out -= Yb12C_out * si1; + p0 = Yb12C_out * si1; + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + //Yb12_out += Xb12C_out * si1; + p1 = Xb12C_out * si1; + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + //Xc12_out -= Yc12C_out * si2; + p2 = Yc12C_out * si2; + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + //Yc12_out += Xc12C_out * si2; + p3 = Xc12C_out * si2; + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + //Xd12_out -= Yd12C_out * si3; + p4 = Yd12C_out * si3; + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + //Yd12_out += Xd12C_out * si3; + p5 = Xd12C_out * si3; + + Xb12_out -= p0; + Yb12_out += p1; + Xc12_out -= p2; + Yc12_out += p3; + Xd12_out -= p4; + Yd12_out += p5; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = Xc12_out; + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = Yc12_out; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = Xb12_out; + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = Yb12_out; + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = Xd12_out; + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = Yd12_out; + + i0 += n1; + } while (i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } + /* Initializations of last stage */ + + j = fftLen >> 2; + ptr1 = &pSrc[0]; + + /* Calculations of last stage */ + do + { + xaIn = ptr1[0]; + yaIn = ptr1[1]; + xbIn = ptr1[2]; + ybIn = ptr1[3]; + xcIn = ptr1[4]; + ycIn = ptr1[5]; + xdIn = ptr1[6]; + ydIn = ptr1[7]; + + /* Butterfly implementation */ + /* xa + xc */ + Xaplusc = xaIn + xcIn; + + /* xa - xc */ + Xaminusc = xaIn - xcIn; + + /* ya + yc */ + Yaplusc = yaIn + ycIn; + + /* ya - yc */ + Yaminusc = yaIn - ycIn; + + /* xb + xd */ + Xbplusd = xbIn + xdIn; + + /* yb + yd */ + Ybplusd = ybIn + ydIn; + + /* (xb-xd) */ + Xbminusd = xbIn - xdIn; + + /* (yb-yd) */ + Ybminusd = ybIn - ydIn; + + /* xa' = (xa+xb+xc+xd) * onebyfftLen */ + a0 = (Xaplusc + Xbplusd); + /* ya' = (ya+yb+yc+yd) * onebyfftLen */ + a1 = (Yaplusc + Ybplusd); + /* xc' = (xa-xb+xc-xd) * onebyfftLen */ + a2 = (Xaplusc - Xbplusd); + /* yc' = (ya-yb+yc-yd) * onebyfftLen */ + a3 = (Yaplusc - Ybplusd); + /* xb' = (xa-yb-xc+yd) * onebyfftLen */ + a4 = (Xaminusc - Ybminusd); + /* yb' = (ya+xb-yc-xd) * onebyfftLen */ + a5 = (Yaminusc + Xbminusd); + /* xd' = (xa-yb-xc+yd) * onebyfftLen */ + a6 = (Xaminusc + Ybminusd); + /* yd' = (ya-xb-yc+xd) * onebyfftLen */ + a7 = (Yaminusc - Xbminusd); + + p0 = a0 * onebyfftLen; + p1 = a1 * onebyfftLen; + p2 = a2 * onebyfftLen; + p3 = a3 * onebyfftLen; + p4 = a4 * onebyfftLen; + p5 = a5 * onebyfftLen; + p6 = a6 * onebyfftLen; + p7 = a7 * onebyfftLen; + + /* xa' = (xa+xb+xc+xd) * onebyfftLen */ + ptr1[0] = p0; + /* ya' = (ya+yb+yc+yd) * onebyfftLen */ + ptr1[1] = p1; + /* xc' = (xa-xb+xc-xd) * onebyfftLen */ + ptr1[2] = p2; + /* yc' = (ya-yb+yc-yd) * onebyfftLen */ + ptr1[3] = p3; + /* xb' = (xa-yb-xc+yd) * onebyfftLen */ + ptr1[4] = p4; + /* yb' = (ya+xb-yc-xd) * onebyfftLen */ + ptr1[5] = p5; + /* xd' = (xa-yb-xc+yd) * onebyfftLen */ + ptr1[6] = p6; + /* yd' = (ya-xb-yc+xd) * onebyfftLen */ + ptr1[7] = p7; + + /* increment source pointer by 8 for next calculations */ + ptr1 = ptr1 + 8U; + + } while (--j); + +#else + + float32_t t1, t2, r1, r2, s1, s2; + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* Calculation of first stage */ + for (k = fftLen; k > 4U; k >>= 2U) + { + /* Initializations for the first stage */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* Calculation of first stage */ + j = 0; + do + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + i0 = j; + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* xa + xc */ + r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)]; + + /* xa - xc */ + r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)]; + + /* ya + yc */ + s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U]; + + /* ya - yc */ + s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U]; + + /* xb + xd */ + t1 = pSrc[2U * i1] + pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = r1 + t1; + + /* xa + xc -(xb + xd) */ + r1 = r1 - t1; + + /* yb + yd */ + t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U]; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = s1 + t2; + + /* (ya + yc) - (yb + yd) */ + s1 = s1 - t2; + + /* (yb - yd) */ + t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U]; + + /* (xb - xd) */ + t2 = pSrc[2U * i1] - pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = (r1 * co2) - (s1 * si2); + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = (s1 * co2) + (r1 * si2); + + /* (xa - xc) - (yb - yd) */ + r1 = r2 - t1; + + /* (xa - xc) + (yb - yd) */ + r2 = r2 + t1; + + /* (ya - yc) + (xb - xd) */ + s1 = s2 + t2; + + /* (ya - yc) - (xb - xd) */ + s2 = s2 - t2; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = (r1 * co1) - (s1 * si1); + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = (s1 * co1) + (r1 * si1); + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = (r2 * co3) - (s2 * si3); + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = (s2 * co3) + (r2 * si3); + + i0 += n1; + } while ( i0 < fftLen); + j++; + } while (j <= (n2 - 1U)); + twidCoefModifier <<= 2U; + } + /* Initializations of last stage */ + n1 = n2; + n2 >>= 2U; + + /* Calculations of last stage */ + for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1) + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Butterfly implementation */ + /* xa + xc */ + r1 = pSrc[2U * i0] + pSrc[2U * i2]; + + /* xa - xc */ + r2 = pSrc[2U * i0] - pSrc[2U * i2]; + + /* ya + yc */ + s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U]; + + /* ya - yc */ + s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U]; + + /* xc + xd */ + t1 = pSrc[2U * i1] + pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = (r1 + t1) * onebyfftLen; + + /* (xa + xb) - (xc + xd) */ + r1 = r1 - t1; + + /* yb + yd */ + t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U]; + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (s1 + t2) * onebyfftLen; + + /* (ya + yc) - (yb + yd) */ + s1 = s1 - t2; + + /* (yb-yd) */ + t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U]; + + /* (xb-xd) */ + t2 = pSrc[2U * i1] - pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = r1 * onebyfftLen; + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = s1 * onebyfftLen; + + /* (xa - xc) - (yb-yd) */ + r1 = r2 - t1; + + /* (xa - xc) + (yb-yd) */ + r2 = r2 + t1; + + /* (ya - yc) + (xb-xd) */ + s1 = s2 + t2; + + /* (ya - yc) - (xb-xd) */ + s2 = s2 - t2; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = r1 * onebyfftLen; + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = s1 * onebyfftLen; + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = r2 * onebyfftLen; + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = s2 * onebyfftLen; + } + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ +} + + diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c new file mode 100644 index 0000000..cb45cd8 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c @@ -0,0 +1,171 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix4_init_f16.c + * Description: Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" +#include "arm_common_tables.h" +#include "arm_common_tables_f16.h" + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the floating-point CFFT/CIFFT. + @deprecated Do not use this function. It has been superceded by \ref arm_cfft_f16 and will be removed in the future. + @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure + @param[in] fftLen length of the FFT + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Details + The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. + */ + +#if defined(ARM_FLOAT16_SUPPORTED) + +arm_status arm_cfft_radix4_init_f16( + arm_cfft_radix4_instance_f16 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = (float16_t *) twiddleCoefF16; + + /* Initialise the Flag for selection of CFFT or CIFFT */ + S->ifftFlag = ifftFlag; + + /* Initialise the Flag for calculation Bit reversal or not */ + S->bitReverseFlag = bitReverseFlag; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024) + + /* Initializations of structure parameters depending on the FFT length */ + switch (S->fftLen) + { + + case 4096U: + /* Initializations of structure parameters for 4096 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 1U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 1U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) armBitRevTable; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.000244140625; + break; + + case 1024U: + /* Initializations of structure parameters for 1024 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 4U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 4U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.0009765625f; + break; + + + case 256U: + /* Initializations of structure parameters for 256 point FFT */ + S->twidCoefModifier = 16U; + S->bitRevFactor = 16U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[15]; + S->onebyfftLen = 0.00390625f; + break; + + case 64U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 64U; + S->bitRevFactor = 64U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[63]; + S->onebyfftLen = 0.015625f; + break; + + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->twidCoefModifier = 256U; + S->bitRevFactor = 256U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[255]; + S->onebyfftLen = 0.0625f; + break; + + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif +#endif + return (status); +} +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c new file mode 100644 index 0000000..b3aabbb --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c @@ -0,0 +1,168 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix4_init_f32.c + * Description: Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Initialization function for the floating-point CFFT/CIFFT. + @deprecated Do not use this function. It has been superceded by \ref arm_cfft_f32 and will be removed in the future. + @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure + @param[in] fftLen length of the FFT + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Details + The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. + */ + +arm_status arm_cfft_radix4_init_f32( + arm_cfft_radix4_instance_f32 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialise the FFT length */ + S->fftLen = fftLen; + + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = (float32_t *) twiddleCoef; + + /* Initialise the Flag for selection of CFFT or CIFFT */ + S->ifftFlag = ifftFlag; + + /* Initialise the Flag for calculation Bit reversal or not */ + S->bitReverseFlag = bitReverseFlag; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096) + + /* Initializations of structure parameters depending on the FFT length */ + switch (S->fftLen) + { + + case 4096U: + /* Initializations of structure parameters for 4096 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 1U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 1U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) armBitRevTable; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.000244140625; + break; + + case 1024U: + /* Initializations of structure parameters for 1024 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 4U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 4U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; + /* Initialise the 1/fftLen Value */ + S->onebyfftLen = 0.0009765625f; + break; + + + case 256U: + /* Initializations of structure parameters for 256 point FFT */ + S->twidCoefModifier = 16U; + S->bitRevFactor = 16U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[15]; + S->onebyfftLen = 0.00390625f; + break; + + case 64U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 64U; + S->bitRevFactor = 64U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[63]; + S->onebyfftLen = 0.015625f; + break; + + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->twidCoefModifier = 256U; + S->bitRevFactor = 256U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[255]; + S->onebyfftLen = 0.0625f; + break; + + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } +#endif +#endif +#endif + + return (status); +} + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c new file mode 100644 index 0000000..7774243 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c @@ -0,0 +1,157 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix4_init_q15.c + * Description: Radix-4 Decimation in Frequency Q15 FFT & IFFT initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + + +/** + @addtogroup ComplexFFT + @{ + */ + + +/** + @brief Initialization function for the Q15 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future. + @param[in,out] S points to an instance of the Q15 CFFT/CIFFT structure + @param[in] fftLen length of the FFT + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Details + The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. + */ + +arm_status arm_cfft_radix4_init_q15( + arm_cfft_radix4_instance_q15 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + /* Initialise the FFT length */ + S->fftLen = fftLen; + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = (q15_t *) twiddleCoef_4096_q15; + /* Initialise the Flag for selection of CFFT or CIFFT */ + S->ifftFlag = ifftFlag; + /* Initialise the Flag for calculation Bit reversal or not */ + S->bitReverseFlag = bitReverseFlag; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024) + + /* Initializations of structure parameters depending on the FFT length */ + switch (S->fftLen) + { + case 4096U: + /* Initializations of structure parameters for 4096 point FFT */ + + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 1U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 1U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) armBitRevTable; + + break; + + case 1024U: + /* Initializations of structure parameters for 1024 point FFT */ + S->twidCoefModifier = 4U; + S->bitRevFactor = 4U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; + + break; + + case 256U: + /* Initializations of structure parameters for 256 point FFT */ + S->twidCoefModifier = 16U; + S->bitRevFactor = 16U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[15]; + + break; + + case 64U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 64U; + S->bitRevFactor = 64U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[63]; + + break; + + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->twidCoefModifier = 256U; + S->bitRevFactor = 256U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[255]; + + break; + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif +#endif + return (status); +} + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c new file mode 100644 index 0000000..04ba393 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c @@ -0,0 +1,154 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix4_init_q31.c + * Description: Radix-4 Decimation in Frequency Q31 FFT & IFFT initialization function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + + @brief Initialization function for the Q31 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future. + @param[in,out] S points to an instance of the Q31 CFFT/CIFFT structure. + @param[in] fftLen length of the FFT. + @param[in] ifftFlag flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Details + The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. +*/ + +arm_status arm_cfft_radix4_init_q31( + arm_cfft_radix4_instance_q31 * S, + uint16_t fftLen, + uint8_t ifftFlag, + uint8_t bitReverseFlag) +{ + + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + /* Initialise the FFT length */ + S->fftLen = fftLen; + /* Initialise the Twiddle coefficient pointer */ + S->pTwiddle = (q31_t *) twiddleCoef_4096_q31; + /* Initialise the Flag for selection of CFFT or CIFFT */ + S->ifftFlag = ifftFlag; + /* Initialise the Flag for calculation Bit reversal or not */ + S->bitReverseFlag = bitReverseFlag; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024) + + /* Initializations of Instance structure depending on the FFT length */ + switch (S->fftLen) + { + /* Initializations of structure parameters for 4096 point FFT */ + case 4096U: + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 1U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 1U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) armBitRevTable; + break; + + /* Initializations of structure parameters for 1024 point FFT */ + case 1024U: + /* Initialise the twiddle coef modifier value */ + S->twidCoefModifier = 4U; + /* Initialise the bit reversal table modifier */ + S->bitRevFactor = 4U; + /* Initialise the bit reversal table pointer */ + S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; + break; + + case 256U: + /* Initializations of structure parameters for 256 point FFT */ + S->twidCoefModifier = 16U; + S->bitRevFactor = 16U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[15]; + break; + + case 64U: + /* Initializations of structure parameters for 64 point FFT */ + S->twidCoefModifier = 64U; + S->bitRevFactor = 64U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[63]; + break; + + case 16U: + /* Initializations of structure parameters for 16 point FFT */ + S->twidCoefModifier = 256U; + S->bitRevFactor = 256U; + S->pBitRevTable = (uint16_t *) & armBitRevTable[255]; + break; + + default: + /* Reporting argument error if fftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif +#endif + return (status); +} + +/** + @} end of ComplexFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c new file mode 100644 index 0000000..280acca --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c @@ -0,0 +1,1809 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix4_q15.c + * Description: This file has function definition of Radix-4 FFT & IFFT function and + * In-place bit reversal using bit reversal table + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + + +void arm_radix4_butterfly_q15( + q15_t * pSrc16, + uint32_t fftLen, + const q15_t * pCoef16, + uint32_t twidCoefModifier); + +void arm_radix4_butterfly_inverse_q15( + q15_t * pSrc16, + uint32_t fftLen, + const q15_t * pCoef16, + uint32_t twidCoefModifier); + +void arm_bitreversal_q15( + q15_t * pSrc, + uint32_t fftLen, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + + +/** + @brief Processing function for the Q15 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future. + @param[in] S points to an instance of the Q15 CFFT/CIFFT structure. + @param[in,out] pSrc points to the complex data buffer. Processing occurs in-place. + @return none + + @par Input and output formats: + Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process. + Hence the output format is different for different FFT sizes. + The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT: + @par + \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT" + \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT" + */ + +void arm_cfft_radix4_q15( + const arm_cfft_radix4_instance_q15 * S, + q15_t * pSrc) +{ + if (S->ifftFlag == 1U) + { + /* Complex IFFT radix-4 */ + arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier); + } + else + { + /* Complex FFT radix-4 */ + arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier); + } + + if (S->bitReverseFlag == 1U) + { + /* Bit Reversal */ + arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable); + } + +} + +/** + @} end of ComplexFFT group + */ + +/* + * Radix-4 FFT algorithm used is : + * + * Input real and imaginary data: + * x(n) = xa + j * ya + * x(n+N/4 ) = xb + j * yb + * x(n+N/2 ) = xc + j * yc + * x(n+3N 4) = xd + j * yd + * + * + * Output real and imaginary data: + * x(4r) = xa'+ j * ya' + * x(4r+1) = xb'+ j * yb' + * x(4r+2) = xc'+ j * yc' + * x(4r+3) = xd'+ j * yd' + * + * + * Twiddle factors for radix-4 FFT: + * Wn = co1 + j * (- si1) + * W2n = co2 + j * (- si2) + * W3n = co3 + j * (- si3) + + * The real and imaginary output values for the radix-4 butterfly are + * xa' = xa + xb + xc + xd + * ya' = ya + yb + yc + yd + * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) + * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) + * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) + * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) + * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) + * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) + * + */ + +/** + @brief Core function for the Q15 CFFT butterfly process. + @param[in,out] pSrc16 points to the in-place buffer of Q15 data type + @param[in] fftLen length of the FFT + @param[in] pCoef16 points to twiddle coefficient buffer + @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + @return none + */ + +void arm_radix4_butterfly_q15( + q15_t * pSrc16, + uint32_t fftLen, + const q15_t * pCoef16, + uint32_t twidCoefModifier) +{ + +#if defined (ARM_MATH_DSP) + + q31_t R, S, T, U; + q31_t C1, C2, C3, out1, out2; + uint32_t n1, n2, ic, i0, j, k; + + q15_t *ptr1; + q15_t *pSi0; + q15_t *pSi1; + q15_t *pSi2; + q15_t *pSi3; + + q31_t xaya, xbyb, xcyc, xdyd; + + /* Total process is divided into three stages */ + + /* process first stage, middle stages, & last stage */ + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* n2 = fftLen/4 */ + n2 >>= 2U; + + /* Index for twiddle coefficient */ + ic = 0U; + + /* Index for input read and output write */ + j = n2; + + pSi0 = pSrc16; + pSi1 = pSi0 + 2 * n2; + pSi2 = pSi1 + 2 * n2; + pSi3 = pSi2 + 2 * n2; + + /* Input is in 1.15(q15) format */ + + /* start of first stage process */ + do + { + /* Butterfly implementation */ + + /* Reading i0, i0+fftLen/2 inputs */ + /* Read ya (real), xa(imag) input */ + T = read_q15x2 (pSi0); + T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */ + T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */ +/* + in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles + T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF); +*/ + + /* Read yc (real), xc(imag) input */ + S = read_q15x2 (pSi2); + S = __SHADD16(S, 0); + S = __SHADD16(S, 0); + + /* R = packed((ya + yc), (xa + xc) ) */ + R = __QADD16(T, S); + + /* S = packed((ya - yc), (xa - xc) ) */ + S = __QSUB16(T, S); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* Read yb (real), xb(imag) input */ + T = read_q15x2 (pSi1); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); + + /* Read yd (real), xd(imag) input */ + U = read_q15x2 (pSi3); + U = __SHADD16(U, 0); + U = __SHADD16(U, 0); + + /* T = packed((yb + yd), (xb + xd) ) */ + T = __QADD16(T, U); + + /* writing the butterfly processed i0 sample */ + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + write_q15x2_ia (&pSi0, __SHADD16(R, T)); + + /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */ + R = __QSUB16(R, T); + + /* co2 & si2 are read from SIMD Coefficient pointer */ + C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic)); + +#ifndef ARM_MATH_BIG_ENDIAN + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + out1 = __SMUAD(C2, R) >> 16U; + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out2 = __SMUSDX(C2, R); +#else + /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out1 = __SMUSDX(R, C2) >> 16U; + /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + out2 = __SMUAD(C2, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Reading i0+fftLen/4 */ + /* T = packed(yb, xb) */ + T = read_q15x2 (pSi1); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* writing output(xc', yc') in little endian format */ + write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 )); + + /* Butterfly calculations */ + /* U = packed(yd, xd) */ + U = read_q15x2 (pSi3); + U = __SHADD16(U, 0); + U = __SHADD16(U, 0); + + /* T = packed(yb-yd, xb-xd) */ + T = __QSUB16(T, U); + +#ifndef ARM_MATH_BIG_ENDIAN + /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */ + R = __QASX(S, T); + /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */ + S = __QSAX(S, T); +#else + /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */ + R = __QSAX(S, T); + /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */ + S = __QASX(S, T); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* co1 & si1 are read from SIMD Coefficient pointer */ + C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic)); + /* Butterfly process for the i0+fftLen/2 sample */ + +#ifndef ARM_MATH_BIG_ENDIAN + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + out1 = __SMUAD(C1, S) >> 16U; + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + out2 = __SMUSDX(C1, S); +#else + /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + out1 = __SMUSDX(S, C1) >> 16U; + /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + out2 = __SMUAD(C1, S); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* writing output(xb', yb') in little endian format */ + write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 )); + + /* co3 & si3 are read from SIMD Coefficient pointer */ + C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic)); + /* Butterfly process for the i0+3fftLen/4 sample */ + +#ifndef ARM_MATH_BIG_ENDIAN + /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */ + out1 = __SMUAD(C3, R) >> 16U; + /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */ + out2 = __SMUSDX(C3, R); +#else + /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */ + out1 = __SMUSDX(R, C3) >> 16U; + /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */ + out2 = __SMUAD(C3, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* writing output(xd', yd') in little endian format */ + write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 )); + + /* Twiddle coefficients index modifier */ + ic = ic + twidCoefModifier; + + } while (--j); + /* data is in 4.11(q11) format */ + + /* end of first stage process */ + + + /* start of middle stage process */ + + /* Twiddle coefficients index modifier */ + twidCoefModifier <<= 2U; + + /* Calculation of Middle stage */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) + { + /* Initializations for the middle stage */ + n1 = n2; + n2 >>= 2U; + ic = 0U; + + for (j = 0U; j <= (n2 - 1U); j++) + { + /* index calculation for the coefficients */ + C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic)); + C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic)); + C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic)); + + /* Twiddle coefficients index modifier */ + ic = ic + twidCoefModifier; + + pSi0 = pSrc16 + 2 * j; + pSi1 = pSi0 + 2 * n2; + pSi2 = pSi1 + 2 * n2; + pSi3 = pSi2 + 2 * n2; + + /* Butterfly implementation */ + for (i0 = j; i0 < fftLen; i0 += n1) + { + /* Reading i0, i0+fftLen/2 inputs */ + /* Read ya (real), xa(imag) input */ + T = read_q15x2 (pSi0); + + /* Read yc (real), xc(imag) input */ + S = read_q15x2 (pSi2); + + /* R = packed( (ya + yc), (xa + xc)) */ + R = __QADD16(T, S); + + /* S = packed((ya - yc), (xa - xc)) */ + S = __QSUB16(T, S); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* Read yb (real), xb(imag) input */ + T = read_q15x2 (pSi1); + + /* Read yd (real), xd(imag) input */ + U = read_q15x2 (pSi3); + + /* T = packed( (yb + yd), (xb + xd)) */ + T = __QADD16(T, U); + + /* writing the butterfly processed i0 sample */ + + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + out1 = __SHADD16(R, T); + out1 = __SHADD16(out1, 0); + write_q15x2 (pSi0, out1); + pSi0 += 2 * n1; + + /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */ + R = __SHSUB16(R, T); + +#ifndef ARM_MATH_BIG_ENDIAN + /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */ + out1 = __SMUAD(C2, R) >> 16U; + + /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out2 = __SMUSDX(C2, R); +#else + /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out1 = __SMUSDX(R, C2) >> 16U; + + /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */ + out2 = __SMUAD(C2, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Reading i0+3fftLen/4 */ + /* Read yb (real), xb(imag) input */ + T = read_q15x2 (pSi1); + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + write_q15x2 (pSi1, __PKHBT( out1, out2, 0 )); + pSi1 += 2 * n1; + + /* Butterfly calculations */ + + /* Read yd (real), xd(imag) input */ + U = read_q15x2 (pSi3); + + /* T = packed(yb-yd, xb-xd) */ + T = __QSUB16(T, U); + +#ifndef ARM_MATH_BIG_ENDIAN + /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */ + R = __SHASX(S, T); + + /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */ + S = __SHSAX(S, T); + + + /* Butterfly process for the i0+fftLen/2 sample */ + out1 = __SMUAD(C1, S) >> 16U; + out2 = __SMUSDX(C1, S); +#else + /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */ + R = __SHSAX(S, T); + + /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */ + S = __SHASX(S, T); + + + /* Butterfly process for the i0+fftLen/2 sample */ + out1 = __SMUSDX(S, C1) >> 16U; + out2 = __SMUAD(C1, S); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + write_q15x2 (pSi2, __PKHBT( out1, out2, 0 )); + pSi2 += 2 * n1; + + /* Butterfly process for the i0+3fftLen/4 sample */ + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUAD(C3, R) >> 16U; + out2 = __SMUSDX(C3, R); +#else + out1 = __SMUSDX(R, C3) >> 16U; + out2 = __SMUAD(C3, R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */ + /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */ + write_q15x2 (pSi3, __PKHBT( out1, out2, 0 )); + pSi3 += 2 * n1; + } + } + /* Twiddle coefficients index modifier */ + twidCoefModifier <<= 2U; + } + /* end of middle stage process */ + + + /* data is in 10.6(q6) format for the 1024 point */ + /* data is in 8.8(q8) format for the 256 point */ + /* data is in 6.10(q10) format for the 64 point */ + /* data is in 4.12(q12) format for the 16 point */ + + /* Initializations for the last stage */ + j = fftLen >> 2; + + ptr1 = &pSrc16[0]; + + /* start of last stage process */ + + /* Butterfly implementation */ + do + { + /* Read xa (real), ya(imag) input */ + xaya = read_q15x2_ia (&ptr1); + + /* Read xb (real), yb(imag) input */ + xbyb = read_q15x2_ia (&ptr1); + + /* Read xc (real), yc(imag) input */ + xcyc = read_q15x2_ia (&ptr1); + + /* Read xd (real), yd(imag) input */ + xdyd = read_q15x2_ia (&ptr1); + + /* R = packed((ya + yc), (xa + xc)) */ + R = __QADD16(xaya, xcyc); + + /* T = packed((yb + yd), (xb + xd)) */ + T = __QADD16(xbyb, xdyd); + + /* pointer updation for writing */ + ptr1 = ptr1 - 8U; + + + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + write_q15x2_ia (&ptr1, __SHADD16(R, T)); + + /* T = packed((yb + yd), (xb + xd)) */ + T = __QADD16(xbyb, xdyd); + + /* xc' = (xa-xb+xc-xd) */ + /* yc' = (ya-yb+yc-yd) */ + write_q15x2_ia (&ptr1, __SHSUB16(R, T)); + + /* S = packed((ya - yc), (xa - xc)) */ + S = __QSUB16(xaya, xcyc); + + /* Read yd (real), xd(imag) input */ + /* T = packed( (yb - yd), (xb - xd)) */ + U = __QSUB16(xbyb, xdyd); + +#ifndef ARM_MATH_BIG_ENDIAN + /* xb' = (xa+yb-xc-yd) */ + /* yb' = (ya-xb-yc+xd) */ + write_q15x2_ia (&ptr1, __SHSAX(S, U)); + + /* xd' = (xa-yb-xc+yd) */ + /* yd' = (ya+xb-yc-xd) */ + write_q15x2_ia (&ptr1, __SHASX(S, U)); +#else + /* xb' = (xa+yb-xc-yd) */ + /* yb' = (ya-xb-yc+xd) */ + write_q15x2_ia (&ptr1, __SHASX(S, U)); + + /* xd' = (xa-yb-xc+yd) */ + /* yd' = (ya+xb-yc-xd) */ + write_q15x2_ia (&ptr1, __SHSAX(S, U)); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + } while (--j); + + /* end of last stage process */ + + /* output is in 11.5(q5) format for the 1024 point */ + /* output is in 9.7(q7) format for the 256 point */ + /* output is in 7.9(q9) format for the 64 point */ + /* output is in 5.11(q11) format for the 16 point */ + + +#else /* #if defined (ARM_MATH_DSP) */ + + q15_t R0, R1, S0, S1, T0, T1, U0, U1; + q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2; + uint32_t n1, n2, ic, i0, i1, i2, i3, j, k; + + /* Total process is divided into three stages */ + + /* process first stage, middle stages, & last stage */ + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* n2 = fftLen/4 */ + n2 >>= 2U; + + /* Index for twiddle coefficient */ + ic = 0U; + + /* Index for input read and output write */ + i0 = 0U; + j = n2; + + /* Input is in 1.15(q15) format */ + + /* start of first stage process */ + do + { + /* Butterfly implementation */ + + /* index calculation for the input as, */ + /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Reading i0, i0+fftLen/2 inputs */ + + /* input is down scale by 4 to avoid overflow */ + /* Read ya (real), xa(imag) input */ + T0 = pSrc16[i0 * 2U] >> 2U; + T1 = pSrc16[(i0 * 2U) + 1U] >> 2U; + + /* input is down scale by 4 to avoid overflow */ + /* Read yc (real), xc(imag) input */ + S0 = pSrc16[i2 * 2U] >> 2U; + S1 = pSrc16[(i2 * 2U) + 1U] >> 2U; + + /* R0 = (ya + yc) */ + R0 = __SSAT(T0 + S0, 16U); + /* R1 = (xa + xc) */ + R1 = __SSAT(T1 + S1, 16U); + + /* S0 = (ya - yc) */ + S0 = __SSAT(T0 - S0, 16); + /* S1 = (xa - xc) */ + S1 = __SSAT(T1 - S1, 16); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* input is down scale by 4 to avoid overflow */ + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U] >> 2U; + T1 = pSrc16[(i1 * 2U) + 1U] >> 2U; + + /* input is down scale by 4 to avoid overflow */ + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U] >> 2U; + U1 = pSrc16[(i3 * 2U) + 1] >> 2U; + + /* T0 = (yb + yd) */ + T0 = __SSAT(T0 + U0, 16U); + /* T1 = (xb + xd) */ + T1 = __SSAT(T1 + U1, 16U); + + /* writing the butterfly processed i0 sample */ + /* ya' = ya + yb + yc + yd */ + /* xa' = xa + xb + xc + xd */ + pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U); + pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U); + + /* R0 = (ya + yc) - (yb + yd) */ + /* R1 = (xa + xc) - (xb + xd) */ + R0 = __SSAT(R0 - T0, 16U); + R1 = __SSAT(R1 - T1, 16U); + + /* co2 & si2 are read from Coefficient pointer */ + Co2 = pCoef16[2U * ic * 2U]; + Si2 = pCoef16[(2U * ic * 2U) + 1]; + + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U); + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U); + + /* Reading i0+fftLen/4 */ + /* input is down scale by 4 to avoid overflow */ + /* T0 = yb, T1 = xb */ + T0 = pSrc16[i1 * 2U] >> 2; + T1 = pSrc16[(i1 * 2U) + 1] >> 2; + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* writing output(xc', yc') in little endian format */ + pSrc16[i1 * 2U] = out1; + pSrc16[(i1 * 2U) + 1] = out2; + + /* Butterfly calculations */ + /* input is down scale by 4 to avoid overflow */ + /* U0 = yd, U1 = xd */ + U0 = pSrc16[i3 * 2U] >> 2; + U1 = pSrc16[(i3 * 2U) + 1] >> 2; + /* T0 = yb-yd */ + T0 = __SSAT(T0 - U0, 16); + /* T1 = xb-xd */ + T1 = __SSAT(T1 - U1, 16); + + /* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */ + R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16); + R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16); + + /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */ + S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U); + S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U); + + /* co1 & si1 are read from Coefficient pointer */ + Co1 = pCoef16[ic * 2U]; + Si1 = pCoef16[(ic * 2U) + 1]; + /* Butterfly process for the i0+fftLen/2 sample */ + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16); + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16); + + /* writing output(xb', yb') in little endian format */ + pSrc16[i2 * 2U] = out1; + pSrc16[(i2 * 2U) + 1] = out2; + + /* Co3 & si3 are read from Coefficient pointer */ + Co3 = pCoef16[3U * (ic * 2U)]; + Si3 = pCoef16[(3U * (ic * 2U)) + 1]; + /* Butterfly process for the i0+3fftLen/4 sample */ + /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */ + out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U); + /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */ + out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U); + /* writing output(xd', yd') in little endian format */ + pSrc16[i3 * 2U] = out1; + pSrc16[(i3 * 2U) + 1] = out2; + + /* Twiddle coefficients index modifier */ + ic = ic + twidCoefModifier; + + /* Updating input index */ + i0 = i0 + 1U; + + } while (--j); + /* data is in 4.11(q11) format */ + + /* end of first stage process */ + + + /* start of middle stage process */ + + /* Twiddle coefficients index modifier */ + twidCoefModifier <<= 2U; + + /* Calculation of Middle stage */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) + { + /* Initializations for the middle stage */ + n1 = n2; + n2 >>= 2U; + ic = 0U; + + for (j = 0U; j <= (n2 - 1U); j++) + { + /* index calculation for the coefficients */ + Co1 = pCoef16[ic * 2U]; + Si1 = pCoef16[(ic * 2U) + 1U]; + Co2 = pCoef16[2U * (ic * 2U)]; + Si2 = pCoef16[(2U * (ic * 2U)) + 1U]; + Co3 = pCoef16[3U * (ic * 2U)]; + Si3 = pCoef16[(3U * (ic * 2U)) + 1U]; + + /* Twiddle coefficients index modifier */ + ic = ic + twidCoefModifier; + + /* Butterfly implementation */ + for (i0 = j; i0 < fftLen; i0 += n1) + { + /* index calculation for the input as, */ + /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Reading i0, i0+fftLen/2 inputs */ + /* Read ya (real), xa(imag) input */ + T0 = pSrc16[i0 * 2U]; + T1 = pSrc16[(i0 * 2U) + 1U]; + + /* Read yc (real), xc(imag) input */ + S0 = pSrc16[i2 * 2U]; + S1 = pSrc16[(i2 * 2U) + 1U]; + + /* R0 = (ya + yc), R1 = (xa + xc) */ + R0 = __SSAT(T0 + S0, 16); + R1 = __SSAT(T1 + S1, 16); + + /* S0 = (ya - yc), S1 =(xa - xc) */ + S0 = __SSAT(T0 - S0, 16); + S1 = __SSAT(T1 - S1, 16); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U]; + T1 = pSrc16[(i1 * 2U) + 1U]; + + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U]; + U1 = pSrc16[(i3 * 2U) + 1U]; + + + /* T0 = (yb + yd), T1 = (xb + xd) */ + T0 = __SSAT(T0 + U0, 16); + T1 = __SSAT(T1 + U1, 16); + + /* writing the butterfly processed i0 sample */ + + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U; + out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U; + + pSrc16[i0 * 2U] = out1; + pSrc16[(2U * i0) + 1U] = out2; + + /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */ + R0 = (R0 >> 1U) - (T0 >> 1U); + R1 = (R1 >> 1U) - (T1 >> 1U); + + /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */ + out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U); + + /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U); + + /* Reading i0+3fftLen/4 */ + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U]; + T1 = pSrc16[(i1 * 2U) + 1U]; + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + pSrc16[i1 * 2U] = out1; + pSrc16[(i1 * 2U) + 1U] = out2; + + /* Butterfly calculations */ + + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U]; + U1 = pSrc16[(i3 * 2U) + 1U]; + + /* T0 = yb-yd, T1 = xb-xd */ + T0 = __SSAT(T0 - U0, 16); + T1 = __SSAT(T1 - U1, 16); + + /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */ + R0 = (S0 >> 1U) - (T1 >> 1U); + R1 = (S1 >> 1U) + (T0 >> 1U); + + /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */ + S0 = (S0 >> 1U) + (T1 >> 1U); + S1 = (S1 >> 1U) - (T0 >> 1U); + + /* Butterfly process for the i0+fftLen/2 sample */ + out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U); + + out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U); + + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + pSrc16[i2 * 2U] = out1; + pSrc16[(i2 * 2U) + 1U] = out2; + + /* Butterfly process for the i0+3fftLen/4 sample */ + out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U); + + out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U); + /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */ + /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */ + pSrc16[i3 * 2U] = out1; + pSrc16[(i3 * 2U) + 1U] = out2; + } + } + /* Twiddle coefficients index modifier */ + twidCoefModifier <<= 2U; + } + /* end of middle stage process */ + + + /* data is in 10.6(q6) format for the 1024 point */ + /* data is in 8.8(q8) format for the 256 point */ + /* data is in 6.10(q10) format for the 64 point */ + /* data is in 4.12(q12) format for the 16 point */ + + /* Initializations for the last stage */ + n1 = n2; + n2 >>= 2U; + + /* start of last stage process */ + + /* Butterfly implementation */ + for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1) + { + /* index calculation for the input as, */ + /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Reading i0, i0+fftLen/2 inputs */ + /* Read ya (real), xa(imag) input */ + T0 = pSrc16[i0 * 2U]; + T1 = pSrc16[(i0 * 2U) + 1U]; + + /* Read yc (real), xc(imag) input */ + S0 = pSrc16[i2 * 2U]; + S1 = pSrc16[(i2 * 2U) + 1U]; + + /* R0 = (ya + yc), R1 = (xa + xc) */ + R0 = __SSAT(T0 + S0, 16U); + R1 = __SSAT(T1 + S1, 16U); + + /* S0 = (ya - yc), S1 = (xa - xc) */ + S0 = __SSAT(T0 - S0, 16U); + S1 = __SSAT(T1 - S1, 16U); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U]; + T1 = pSrc16[(i1 * 2U) + 1U]; + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U]; + U1 = pSrc16[(i3 * 2U) + 1U]; + + /* T0 = (yb + yd), T1 = (xb + xd)) */ + T0 = __SSAT(T0 + U0, 16U); + T1 = __SSAT(T1 + U1, 16U); + + /* writing the butterfly processed i0 sample */ + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U); + pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U); + + /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */ + R0 = (R0 >> 1U) - (T0 >> 1U); + R1 = (R1 >> 1U) - (T1 >> 1U); + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U]; + T1 = pSrc16[(i1 * 2U) + 1U]; + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* xc' = (xa-xb+xc-xd) */ + /* yc' = (ya-yb+yc-yd) */ + pSrc16[i1 * 2U] = R0; + pSrc16[(i1 * 2U) + 1U] = R1; + + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U]; + U1 = pSrc16[(i3 * 2U) + 1U]; + /* T0 = (yb - yd), T1 = (xb - xd) */ + T0 = __SSAT(T0 - U0, 16U); + T1 = __SSAT(T1 - U1, 16U); + + /* writing the butterfly processed i0 + fftLen/2 sample */ + /* xb' = (xa+yb-xc-yd) */ + /* yb' = (ya-xb-yc+xd) */ + pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U); + pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U); + + /* writing the butterfly processed i0 + 3fftLen/4 sample */ + /* xd' = (xa-yb-xc+yd) */ + /* yd' = (ya+xb-yc-xd) */ + pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U); + pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U); + + } + + /* end of last stage process */ + + /* output is in 11.5(q5) format for the 1024 point */ + /* output is in 9.7(q7) format for the 256 point */ + /* output is in 7.9(q9) format for the 64 point */ + /* output is in 5.11(q11) format for the 16 point */ + +#endif /* #if defined (ARM_MATH_DSP) */ + +} + + +/** + @brief Core function for the Q15 CIFFT butterfly process. + @param[in,out] pSrc16 points to the in-place buffer of Q15 data type + @param[in] fftLen length of the FFT + @param[in] pCoef16 points to twiddle coefficient buffer + @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. + @return none + */ + +/* + * Radix-4 IFFT algorithm used is : + * + * CIFFT uses same twiddle coefficients as CFFT function + * x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4] + * + * + * IFFT is implemented with following changes in equations from FFT + * + * Input real and imaginary data: + * x(n) = xa + j * ya + * x(n+N/4 ) = xb + j * yb + * x(n+N/2 ) = xc + j * yc + * x(n+3N 4) = xd + j * yd + * + * + * Output real and imaginary data: + * x(4r) = xa'+ j * ya' + * x(4r+1) = xb'+ j * yb' + * x(4r+2) = xc'+ j * yc' + * x(4r+3) = xd'+ j * yd' + * + * + * Twiddle factors for radix-4 IFFT: + * Wn = co1 + j * (si1) + * W2n = co2 + j * (si2) + * W3n = co3 + j * (si3) + + * The real and imaginary output values for the radix-4 butterfly are + * xa' = xa + xb + xc + xd + * ya' = ya + yb + yc + yd + * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) + * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) + * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) + * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) + * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) + * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) + * + */ + +void arm_radix4_butterfly_inverse_q15( + q15_t * pSrc16, + uint32_t fftLen, + const q15_t * pCoef16, + uint32_t twidCoefModifier) +{ + +#if defined (ARM_MATH_DSP) + + q31_t R, S, T, U; + q31_t C1, C2, C3, out1, out2; + uint32_t n1, n2, ic, i0, j, k; + + q15_t *ptr1; + q15_t *pSi0; + q15_t *pSi1; + q15_t *pSi2; + q15_t *pSi3; + + q31_t xaya, xbyb, xcyc, xdyd; + + /* Total process is divided into three stages */ + + /* process first stage, middle stages, & last stage */ + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* n2 = fftLen/4 */ + n2 >>= 2U; + + /* Index for twiddle coefficient */ + ic = 0U; + + /* Index for input read and output write */ + j = n2; + + pSi0 = pSrc16; + pSi1 = pSi0 + 2 * n2; + pSi2 = pSi1 + 2 * n2; + pSi3 = pSi2 + 2 * n2; + + /* Input is in 1.15(q15) format */ + + /* start of first stage process */ + do + { + /* Butterfly implementation */ + + /* Reading i0, i0+fftLen/2 inputs */ + /* Read ya (real), xa(imag) input */ + T = read_q15x2 (pSi0); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); + + /* Read yc (real), xc(imag) input */ + S = read_q15x2 (pSi2); + S = __SHADD16(S, 0); + S = __SHADD16(S, 0); + + /* R = packed((ya + yc), (xa + xc) ) */ + R = __QADD16(T, S); + + /* S = packed((ya - yc), (xa - xc) ) */ + S = __QSUB16(T, S); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* Read yb (real), xb(imag) input */ + T = read_q15x2 (pSi1); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); + + /* Read yd (real), xd(imag) input */ + U = read_q15x2 (pSi3); + U = __SHADD16(U, 0); + U = __SHADD16(U, 0); + + /* T = packed((yb + yd), (xb + xd) ) */ + T = __QADD16(T, U); + + /* writing the butterfly processed i0 sample */ + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + write_q15x2_ia (&pSi0, __SHADD16(R, T)); + + /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */ + R = __QSUB16(R, T); + + /* co2 & si2 are read from SIMD Coefficient pointer */ + C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic)); + +#ifndef ARM_MATH_BIG_ENDIAN + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + out1 = __SMUSD(C2, R) >> 16U; + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out2 = __SMUADX(C2, R); +#else + /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out1 = __SMUADX(C2, R) >> 16U; + /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + out2 = __SMUSD(__QSUB16(0, C2), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Reading i0+fftLen/4 */ + /* T = packed(yb, xb) */ + T = read_q15x2 (pSi1); + T = __SHADD16(T, 0); + T = __SHADD16(T, 0); + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* writing output(xc', yc') in little endian format */ + write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 )); + + /* Butterfly calculations */ + /* U = packed(yd, xd) */ + U = read_q15x2 (pSi3); + U = __SHADD16(U, 0); + U = __SHADD16(U, 0); + + /* T = packed(yb-yd, xb-xd) */ + T = __QSUB16(T, U); + +#ifndef ARM_MATH_BIG_ENDIAN + /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */ + R = __QSAX(S, T); + /* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */ + S = __QASX(S, T); +#else + /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */ + R = __QASX(S, T); + /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */ + S = __QSAX(S, T); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* co1 & si1 are read from SIMD Coefficient pointer */ + C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic)); + /* Butterfly process for the i0+fftLen/2 sample */ + +#ifndef ARM_MATH_BIG_ENDIAN + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + out1 = __SMUSD(C1, S) >> 16U; + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + out2 = __SMUADX(C1, S); +#else + /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + out1 = __SMUADX(C1, S) >> 16U; + /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + out2 = __SMUSD(__QSUB16(0, C1), S); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* writing output(xb', yb') in little endian format */ + write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 )); + + /* co3 & si3 are read from SIMD Coefficient pointer */ + C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic)); + /* Butterfly process for the i0+3fftLen/4 sample */ + +#ifndef ARM_MATH_BIG_ENDIAN + /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */ + out1 = __SMUSD(C3, R) >> 16U; + /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */ + out2 = __SMUADX(C3, R); +#else + /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */ + out1 = __SMUADX(C3, R) >> 16U; + /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */ + out2 = __SMUSD(__QSUB16(0, C3), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* writing output(xd', yd') in little endian format */ + write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 )); + + /* Twiddle coefficients index modifier */ + ic = ic + twidCoefModifier; + + } while (--j); + /* data is in 4.11(q11) format */ + + /* end of first stage process */ + + + /* start of middle stage process */ + + /* Twiddle coefficients index modifier */ + twidCoefModifier <<= 2U; + + /* Calculation of Middle stage */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) + { + /* Initializations for the middle stage */ + n1 = n2; + n2 >>= 2U; + ic = 0U; + + for (j = 0U; j <= (n2 - 1U); j++) + { + /* index calculation for the coefficients */ + C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic)); + C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic)); + C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic)); + + /* Twiddle coefficients index modifier */ + ic = ic + twidCoefModifier; + + pSi0 = pSrc16 + 2 * j; + pSi1 = pSi0 + 2 * n2; + pSi2 = pSi1 + 2 * n2; + pSi3 = pSi2 + 2 * n2; + + /* Butterfly implementation */ + for (i0 = j; i0 < fftLen; i0 += n1) + { + /* Reading i0, i0+fftLen/2 inputs */ + /* Read ya (real), xa(imag) input */ + T = read_q15x2 (pSi0); + + /* Read yc (real), xc(imag) input */ + S = read_q15x2 (pSi2); + + /* R = packed( (ya + yc), (xa + xc)) */ + R = __QADD16(T, S); + + /* S = packed((ya - yc), (xa - xc)) */ + S = __QSUB16(T, S); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* Read yb (real), xb(imag) input */ + T = read_q15x2 (pSi1); + + /* Read yd (real), xd(imag) input */ + U = read_q15x2 (pSi3); + + /* T = packed( (yb + yd), (xb + xd)) */ + T = __QADD16(T, U); + + /* writing the butterfly processed i0 sample */ + + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + out1 = __SHADD16(R, T); + out1 = __SHADD16(out1, 0); + write_q15x2 (pSi0, out1); + pSi0 += 2 * n1; + + /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */ + R = __SHSUB16(R, T); + +#ifndef ARM_MATH_BIG_ENDIAN + /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */ + out1 = __SMUSD(C2, R) >> 16U; + + /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out2 = __SMUADX(C2, R); +#else + /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + out1 = __SMUADX(R, C2) >> 16U; + + /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */ + out2 = __SMUSD(__QSUB16(0, C2), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* Reading i0+3fftLen/4 */ + /* Read yb (real), xb(imag) input */ + T = read_q15x2 (pSi1); + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + write_q15x2 (pSi1, __PKHBT( out1, out2, 0 )); + pSi1 += 2 * n1; + + /* Butterfly calculations */ + + /* Read yd (real), xd(imag) input */ + U = read_q15x2 (pSi3); + + /* T = packed(yb-yd, xb-xd) */ + T = __QSUB16(T, U); + +#ifndef ARM_MATH_BIG_ENDIAN + /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */ + R = __SHSAX(S, T); + + /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */ + S = __SHASX(S, T); + + /* Butterfly process for the i0+fftLen/2 sample */ + out1 = __SMUSD(C1, S) >> 16U; + out2 = __SMUADX(C1, S); +#else + /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */ + R = __SHASX(S, T); + + /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */ + S = __SHSAX(S, T); + + /* Butterfly process for the i0+fftLen/2 sample */ + out1 = __SMUADX(S, C1) >> 16U; + out2 = __SMUSD(__QSUB16(0, C1), S); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + write_q15x2 (pSi2, __PKHBT( out1, out2, 0 )); + pSi2 += 2 * n1; + + /* Butterfly process for the i0+3fftLen/4 sample */ + +#ifndef ARM_MATH_BIG_ENDIAN + out1 = __SMUSD(C3, R) >> 16U; + out2 = __SMUADX(C3, R); +#else + out1 = __SMUADX(C3, R) >> 16U; + out2 = __SMUSD(__QSUB16(0, C3), R); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */ + /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */ + write_q15x2 (pSi3, __PKHBT( out1, out2, 0 )); + pSi3 += 2 * n1; + } + } + /* Twiddle coefficients index modifier */ + twidCoefModifier <<= 2U; + } + /* end of middle stage process */ + + /* data is in 10.6(q6) format for the 1024 point */ + /* data is in 8.8(q8) format for the 256 point */ + /* data is in 6.10(q10) format for the 64 point */ + /* data is in 4.12(q12) format for the 16 point */ + + /* Initializations for the last stage */ + j = fftLen >> 2; + + ptr1 = &pSrc16[0]; + + /* start of last stage process */ + + /* Butterfly implementation */ + do + { + /* Read xa (real), ya(imag) input */ + xaya = read_q15x2_ia (&ptr1); + + /* Read xb (real), yb(imag) input */ + xbyb = read_q15x2_ia (&ptr1); + + /* Read xc (real), yc(imag) input */ + xcyc = read_q15x2_ia (&ptr1); + + /* Read xd (real), yd(imag) input */ + xdyd = read_q15x2_ia (&ptr1); + + /* R = packed((ya + yc), (xa + xc)) */ + R = __QADD16(xaya, xcyc); + + /* T = packed((yb + yd), (xb + xd)) */ + T = __QADD16(xbyb, xdyd); + + /* pointer updation for writing */ + ptr1 = ptr1 - 8U; + + + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + write_q15x2_ia (&ptr1, __SHADD16(R, T)); + + /* T = packed((yb + yd), (xb + xd)) */ + T = __QADD16(xbyb, xdyd); + + /* xc' = (xa-xb+xc-xd) */ + /* yc' = (ya-yb+yc-yd) */ + write_q15x2_ia (&ptr1, __SHSUB16(R, T)); + + /* S = packed((ya - yc), (xa - xc)) */ + S = __QSUB16(xaya, xcyc); + + /* Read yd (real), xd(imag) input */ + /* T = packed( (yb - yd), (xb - xd)) */ + U = __QSUB16(xbyb, xdyd); + +#ifndef ARM_MATH_BIG_ENDIAN + /* xb' = (xa+yb-xc-yd) */ + /* yb' = (ya-xb-yc+xd) */ + write_q15x2_ia (&ptr1, __SHASX(S, U)); + + /* xd' = (xa-yb-xc+yd) */ + /* yd' = (ya+xb-yc-xd) */ + write_q15x2_ia (&ptr1, __SHSAX(S, U)); +#else + /* xb' = (xa+yb-xc-yd) */ + /* yb' = (ya-xb-yc+xd) */ + write_q15x2_ia (&ptr1, __SHSAX(S, U)); + + /* xd' = (xa-yb-xc+yd) */ + /* yd' = (ya+xb-yc-xd) */ + write_q15x2_ia (&ptr1, __SHASX(S, U)); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + } while (--j); + + /* end of last stage process */ + + /* output is in 11.5(q5) format for the 1024 point */ + /* output is in 9.7(q7) format for the 256 point */ + /* output is in 7.9(q9) format for the 64 point */ + /* output is in 5.11(q11) format for the 16 point */ + + +#else /* arm_radix4_butterfly_inverse_q15 */ + + q15_t R0, R1, S0, S1, T0, T1, U0, U1; + q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2; + uint32_t n1, n2, ic, i0, i1, i2, i3, j, k; + + /* Total process is divided into three stages */ + + /* process first stage, middle stages, & last stage */ + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + + /* n2 = fftLen/4 */ + n2 >>= 2U; + + /* Index for twiddle coefficient */ + ic = 0U; + + /* Index for input read and output write */ + i0 = 0U; + + j = n2; + + /* Input is in 1.15(q15) format */ + + /* Start of first stage process */ + do + { + /* Butterfly implementation */ + + /* index calculation for the input as, */ + /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Reading i0, i0+fftLen/2 inputs */ + /* input is down scale by 4 to avoid overflow */ + /* Read ya (real), xa(imag) input */ + T0 = pSrc16[i0 * 2U] >> 2U; + T1 = pSrc16[(i0 * 2U) + 1U] >> 2U; + /* input is down scale by 4 to avoid overflow */ + /* Read yc (real), xc(imag) input */ + S0 = pSrc16[i2 * 2U] >> 2U; + S1 = pSrc16[(i2 * 2U) + 1U] >> 2U; + + /* R0 = (ya + yc), R1 = (xa + xc) */ + R0 = __SSAT(T0 + S0, 16U); + R1 = __SSAT(T1 + S1, 16U); + /* S0 = (ya - yc), S1 = (xa - xc) */ + S0 = __SSAT(T0 - S0, 16U); + S1 = __SSAT(T1 - S1, 16U); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* input is down scale by 4 to avoid overflow */ + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U] >> 2U; + T1 = pSrc16[(i1 * 2U) + 1U] >> 2U; + /* Read yd (real), xd(imag) input */ + /* input is down scale by 4 to avoid overflow */ + U0 = pSrc16[i3 * 2U] >> 2U; + U1 = pSrc16[(i3 * 2U) + 1U] >> 2U; + + /* T0 = (yb + yd), T1 = (xb + xd) */ + T0 = __SSAT(T0 + U0, 16U); + T1 = __SSAT(T1 + U1, 16U); + + /* writing the butterfly processed i0 sample */ + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U); + pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U); + + /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */ + R0 = __SSAT(R0 - T0, 16U); + R1 = __SSAT(R1 - T1, 16U); + /* co2 & si2 are read from Coefficient pointer */ + Co2 = pCoef16[2U * ic * 2U]; + Si2 = pCoef16[(2U * ic * 2U) + 1U]; + /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */ + out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U); + /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */ + out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U); + + /* Reading i0+fftLen/4 */ + /* input is down scale by 4 to avoid overflow */ + /* T0 = yb, T1 = xb */ + T0 = pSrc16[i1 * 2U] >> 2U; + T1 = pSrc16[(i1 * 2U) + 1U] >> 2U; + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* writing output(xc', yc') in little endian format */ + pSrc16[i1 * 2U] = out1; + pSrc16[(i1 * 2U) + 1U] = out2; + + /* Butterfly calculations */ + /* input is down scale by 4 to avoid overflow */ + /* U0 = yd, U1 = xd) */ + U0 = pSrc16[i3 * 2U] >> 2U; + U1 = pSrc16[(i3 * 2U) + 1U] >> 2U; + + /* T0 = yb-yd, T1 = xb-xd) */ + T0 = __SSAT(T0 - U0, 16U); + T1 = __SSAT(T1 - U1, 16U); + /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */ + R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16); + R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16); + /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */ + S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16); + S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16); + + /* co1 & si1 are read from Coefficient pointer */ + Co1 = pCoef16[ic * 2U]; + Si1 = pCoef16[(ic * 2U) + 1U]; + /* Butterfly process for the i0+fftLen/2 sample */ + /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */ + out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U); + /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */ + out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U); + /* writing output(xb', yb') in little endian format */ + pSrc16[i2 * 2U] = out1; + pSrc16[(i2 * 2U) + 1U] = out2; + + /* Co3 & si3 are read from Coefficient pointer */ + Co3 = pCoef16[3U * ic * 2U]; + Si3 = pCoef16[(3U * ic * 2U) + 1U]; + /* Butterfly process for the i0+3fftLen/4 sample */ + /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */ + out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U); + /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */ + out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U); + /* writing output(xd', yd') in little endian format */ + pSrc16[i3 * 2U] = out1; + pSrc16[(i3 * 2U) + 1U] = out2; + + /* Twiddle coefficients index modifier */ + ic = ic + twidCoefModifier; + + /* Updating input index */ + i0 = i0 + 1U; + + } while (--j); + + /* End of first stage process */ + + /* data is in 4.11(q11) format */ + + + /* Start of Middle stage process */ + + /* Twiddle coefficients index modifier */ + twidCoefModifier <<= 2U; + + /* Calculation of Middle stage */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) + { + /* Initializations for the middle stage */ + n1 = n2; + n2 >>= 2U; + ic = 0U; + + for (j = 0U; j <= (n2 - 1U); j++) + { + /* index calculation for the coefficients */ + Co1 = pCoef16[ic * 2U]; + Si1 = pCoef16[(ic * 2U) + 1U]; + Co2 = pCoef16[2U * ic * 2U]; + Si2 = pCoef16[2U * ic * 2U + 1U]; + Co3 = pCoef16[3U * ic * 2U]; + Si3 = pCoef16[(3U * ic * 2U) + 1U]; + + /* Twiddle coefficients index modifier */ + ic = ic + twidCoefModifier; + + /* Butterfly implementation */ + for (i0 = j; i0 < fftLen; i0 += n1) + { + /* index calculation for the input as, */ + /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Reading i0, i0+fftLen/2 inputs */ + /* Read ya (real), xa(imag) input */ + T0 = pSrc16[i0 * 2U]; + T1 = pSrc16[(i0 * 2U) + 1U]; + + /* Read yc (real), xc(imag) input */ + S0 = pSrc16[i2 * 2U]; + S1 = pSrc16[(i2 * 2U) + 1U]; + + + /* R0 = (ya + yc), R1 = (xa + xc) */ + R0 = __SSAT(T0 + S0, 16U); + R1 = __SSAT(T1 + S1, 16U); + /* S0 = (ya - yc), S1 = (xa - xc) */ + S0 = __SSAT(T0 - S0, 16U); + S1 = __SSAT(T1 - S1, 16U); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U]; + T1 = pSrc16[(i1 * 2U) + 1U]; + + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U]; + U1 = pSrc16[(i3 * 2U) + 1U]; + + /* T0 = (yb + yd), T1 = (xb + xd) */ + T0 = __SSAT(T0 + U0, 16U); + T1 = __SSAT(T1 + U1, 16U); + + /* writing the butterfly processed i0 sample */ + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U; + pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U; + + /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */ + R0 = (R0 >> 1U) - (T0 >> 1U); + R1 = (R1 >> 1U) - (T1 >> 1U); + + /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */ + out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16); + /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */ + out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16); + + /* Reading i0+3fftLen/4 */ + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U]; + T1 = pSrc16[(i1 * 2U) + 1U]; + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */ + /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */ + pSrc16[i1 * 2U] = out1; + pSrc16[(i1 * 2U) + 1U] = out2; + + /* Butterfly calculations */ + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U]; + U1 = pSrc16[(i3 * 2U) + 1U]; + + /* T0 = yb-yd, T1 = xb-xd) */ + T0 = __SSAT(T0 - U0, 16U); + T1 = __SSAT(T1 - U1, 16U); + + /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */ + R0 = (S0 >> 1U) + (T1 >> 1U); + R1 = (S1 >> 1U) - (T0 >> 1U); + + /* S1 = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */ + S0 = (S0 >> 1U) - (T1 >> 1U); + S1 = (S1 >> 1U) + (T0 >> 1U); + + /* Butterfly process for the i0+fftLen/2 sample */ + out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U); + out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U); + /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */ + /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */ + pSrc16[i2 * 2U] = out1; + pSrc16[(i2 * 2U) + 1U] = out2; + + /* Butterfly process for the i0+3fftLen/4 sample */ + out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U); + + out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U); + /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */ + /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */ + pSrc16[i3 * 2U] = out1; + pSrc16[(i3 * 2U) + 1U] = out2; + + + } + } + /* Twiddle coefficients index modifier */ + twidCoefModifier <<= 2U; + } + /* End of Middle stages process */ + + + /* data is in 10.6(q6) format for the 1024 point */ + /* data is in 8.8(q8) format for the 256 point */ + /* data is in 6.10(q10) format for the 64 point */ + /* data is in 4.12(q12) format for the 16 point */ + + /* start of last stage process */ + + + /* Initializations for the last stage */ + n1 = n2; + n2 >>= 2U; + + /* Butterfly implementation */ + for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1) + { + /* index calculation for the input as, */ + /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Reading i0, i0+fftLen/2 inputs */ + /* Read ya (real), xa(imag) input */ + T0 = pSrc16[i0 * 2U]; + T1 = pSrc16[(i0 * 2U) + 1U]; + /* Read yc (real), xc(imag) input */ + S0 = pSrc16[i2 * 2U]; + S1 = pSrc16[(i2 * 2U) + 1U]; + + /* R0 = (ya + yc), R1 = (xa + xc) */ + R0 = __SSAT(T0 + S0, 16U); + R1 = __SSAT(T1 + S1, 16U); + /* S0 = (ya - yc), S1 = (xa - xc) */ + S0 = __SSAT(T0 - S0, 16U); + S1 = __SSAT(T1 - S1, 16U); + + /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */ + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U]; + T1 = pSrc16[(i1 * 2U) + 1U]; + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U]; + U1 = pSrc16[(i3 * 2U) + 1U]; + + /* T0 = (yb + yd), T1 = (xb + xd) */ + T0 = __SSAT(T0 + U0, 16U); + T1 = __SSAT(T1 + U1, 16U); + + /* writing the butterfly processed i0 sample */ + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U); + pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U); + + /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */ + R0 = (R0 >> 1U) - (T0 >> 1U); + R1 = (R1 >> 1U) - (T1 >> 1U); + + /* Read yb (real), xb(imag) input */ + T0 = pSrc16[i1 * 2U]; + T1 = pSrc16[(i1 * 2U) + 1U]; + + /* writing the butterfly processed i0 + fftLen/4 sample */ + /* xc' = (xa-xb+xc-xd) */ + /* yc' = (ya-yb+yc-yd) */ + pSrc16[i1 * 2U] = R0; + pSrc16[(i1 * 2U) + 1U] = R1; + + /* Read yd (real), xd(imag) input */ + U0 = pSrc16[i3 * 2U]; + U1 = pSrc16[(i3 * 2U) + 1U]; + /* T0 = (yb - yd), T1 = (xb - xd) */ + T0 = __SSAT(T0 - U0, 16U); + T1 = __SSAT(T1 - U1, 16U); + + /* writing the butterfly processed i0 + fftLen/2 sample */ + /* xb' = (xa-yb-xc+yd) */ + /* yb' = (ya+xb-yc-xd) */ + pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U); + pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U); + + + /* writing the butterfly processed i0 + 3fftLen/4 sample */ + /* xd' = (xa+yb-xc-yd) */ + /* yd' = (ya-xb-yc+xd) */ + pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U); + pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U); + } + /* end of last stage process */ + + /* output is in 11.5(q5) format for the 1024 point */ + /* output is in 9.7(q7) format for the 256 point */ + /* output is in 7.9(q9) format for the 64 point */ + /* output is in 5.11(q11) format for the 16 point */ + +#endif /* #if defined (ARM_MATH_DSP) */ + +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c new file mode 100644 index 0000000..46c1e47 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c @@ -0,0 +1,827 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix4_q31.c + * Description: This file has function definition of Radix-4 FFT & IFFT function and + * In-place bit reversal using bit reversal table + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +void arm_radix4_butterfly_inverse_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint32_t twidCoefModifier); + +void arm_radix4_butterfly_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint32_t twidCoefModifier); + +void arm_bitreversal_q31( + q31_t * pSrc, + uint32_t fftLen, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup ComplexFFT + @{ + */ + +/** + @brief Processing function for the Q31 CFFT/CIFFT. + @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future. + @param[in] S points to an instance of the Q31 CFFT/CIFFT structure + @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place + @return none + + @par Input and output formats: + Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process. + Hence the output format is different for different FFT sizes. + The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT: + @par + \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT" + \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT" + */ + +void arm_cfft_radix4_q31( + const arm_cfft_radix4_instance_q31 * S, + q31_t * pSrc) +{ + if (S->ifftFlag == 1U) + { + /* Complex IFFT radix-4 */ + arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier); + } + else + { + /* Complex FFT radix-4 */ + arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier); + } + + if (S->bitReverseFlag == 1U) + { + /* Bit Reversal */ + arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable); + } + +} + +/** + @} end of ComplexFFT group + */ + +/* + * Radix-4 FFT algorithm used is : + * + * Input real and imaginary data: + * x(n) = xa + j * ya + * x(n+N/4 ) = xb + j * yb + * x(n+N/2 ) = xc + j * yc + * x(n+3N 4) = xd + j * yd + * + * + * Output real and imaginary data: + * x(4r) = xa'+ j * ya' + * x(4r+1) = xb'+ j * yb' + * x(4r+2) = xc'+ j * yc' + * x(4r+3) = xd'+ j * yd' + * + * + * Twiddle factors for radix-4 FFT: + * Wn = co1 + j * (- si1) + * W2n = co2 + j * (- si2) + * W3n = co3 + j * (- si3) + * + * Butterfly implementation: + * xa' = xa + xb + xc + xd + * ya' = ya + yb + yc + yd + * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) + * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) + * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) + * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) + * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) + * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) + * + */ + +/** + @brief Core function for the Q31 CFFT butterfly process. + @param[in,out] pSrc points to the in-place buffer of Q31 data type. + @param[in] fftLen length of the FFT. + @param[in] pCoef points to twiddle coefficient buffer. + @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. + @return none + */ + +void arm_radix4_butterfly_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint32_t twidCoefModifier) +{ + uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k; + q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3; + + q31_t xa, xb, xc, xd; + q31_t ya, yb, yc, yd; + q31_t xa_out, xb_out, xc_out, xd_out; + q31_t ya_out, yb_out, yc_out, yd_out; + + q31_t *ptr1; + + /* Total process is divided into three stages */ + + /* process first stage, middle stages, & last stage */ + + + /* start of first stage process */ + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + /* n2 = fftLen/4 */ + n2 >>= 2U; + i0 = 0U; + ia1 = 0U; + + j = n2; + + /* Calculation of first stage */ + do + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* input is in 1.31(q31) format and provide 4 guard bits for the input */ + + /* Butterfly implementation */ + /* xa + xc */ + r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U); + /* xa - xc */ + r2 = (pSrc[(2U * i0)] >> 4U) - (pSrc[(2U * i2)] >> 4U); + + /* xb + xd */ + t1 = (pSrc[(2U * i1)] >> 4U) + (pSrc[(2U * i3)] >> 4U); + + /* ya + yc */ + s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U); + /* ya - yc */ + s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U); + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = (r1 + t1); + /* (xa + xc) - (xb + xd) */ + r1 = r1 - t1; + /* yb + yd */ + t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U); + + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (s1 + t2); + + /* (ya + yc) - (yb + yd) */ + s1 = s1 - t2; + + /* yb - yd */ + t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U); + /* xb - xd */ + t2 = (pSrc[(2U * i1)] >> 4U) - (pSrc[(2U * i3)] >> 4U); + + /* index calculation for the coefficients */ + ia2 = 2U * ia1; + co2 = pCoef[(ia2 * 2U)]; + si2 = pCoef[(ia2 * 2U) + 1U]; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) + + ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U; + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) - + ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U; + + /* (xa - xc) + (yb - yd) */ + r1 = r2 + t1; + /* (xa - xc) - (yb - yd) */ + r2 = r2 - t1; + + /* (ya - yc) - (xb - xd) */ + s1 = s2 - t2; + /* (ya - yc) + (xb - xd) */ + s2 = s2 + t2; + + co1 = pCoef[(ia1 * 2U)]; + si1 = pCoef[(ia1 * 2U) + 1U]; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) + + ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U; + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) - + ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U; + + /* index calculation for the coefficients */ + ia3 = 3U * ia1; + co3 = pCoef[(ia3 * 2U)]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) + + ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U; + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) - + ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + /* Updating input index */ + i0 = i0 + 1U; + + } while (--j); + + /* end of first stage process */ + + /* data is in 5.27(q27) format */ + + + /* start of Middle stages process */ + + + /* each stage in middle stages provides two down scaling of the input */ + + twidCoefModifier <<= 2U; + + + for (k = fftLen / 4U; k > 4U; k >>= 2U) + { + /* Initializations for the first stage */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + /* Calculation of first stage */ + for (j = 0U; j <= (n2 - 1U); j++) + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[(ia1 * 2U)]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[(ia2 * 2U)]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[(ia3 * 2U)]; + si3 = pCoef[(ia3 * 2U) + 1U]; + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + for (i0 = j; i0 < fftLen; i0 += n1) + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Butterfly implementation */ + /* xa + xc */ + r1 = pSrc[2U * i0] + pSrc[2U * i2]; + /* xa - xc */ + r2 = pSrc[2U * i0] - pSrc[2U * i2]; + + /* ya + yc */ + s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U]; + /* ya - yc */ + s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U]; + + /* xb + xd */ + t1 = pSrc[2U * i1] + pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = (r1 + t1) >> 2U; + /* xa + xc -(xb + xd) */ + r1 = r1 - t1; + + /* yb + yd */ + t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U]; + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U; + + /* (ya + yc) - (yb + yd) */ + s1 = s1 - t2; + + /* (yb - yd) */ + t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U]; + /* (xb - xd) */ + t2 = pSrc[2U * i1] - pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) + + ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U; + + /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) - + ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U; + + /* (xa - xc) + (yb - yd) */ + r1 = r2 + t1; + /* (xa - xc) - (yb - yd) */ + r2 = r2 - t1; + + /* (ya - yc) - (xb - xd) */ + s1 = s2 - t2; + /* (ya - yc) + (xb - xd) */ + s2 = s2 + t2; + + /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) + + ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U; + + /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) - + ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U; + + /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) + + ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U; + + /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) - + ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U; + } + } + twidCoefModifier <<= 2U; + } + + /* End of Middle stages process */ + + /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */ + /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */ + /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */ + /* data is in 5.27(q27) format for the 16 point as there are no middle stages */ + + + /* start of Last stage process */ + /* Initializations for the last stage */ + j = fftLen >> 2; + ptr1 = &pSrc[0]; + + /* Calculations of last stage */ + do + { + /* Read xa (real), ya(imag) input */ + xa = *ptr1++; + ya = *ptr1++; + + /* Read xb (real), yb(imag) input */ + xb = *ptr1++; + yb = *ptr1++; + + /* Read xc (real), yc(imag) input */ + xc = *ptr1++; + yc = *ptr1++; + + /* Read xc (real), yc(imag) input */ + xd = *ptr1++; + yd = *ptr1++; + + /* xa' = xa + xb + xc + xd */ + xa_out = xa + xb + xc + xd; + + /* ya' = ya + yb + yc + yd */ + ya_out = ya + yb + yc + yd; + + /* pointer updation for writing */ + ptr1 = ptr1 - 8U; + + /* writing xa' and ya' */ + *ptr1++ = xa_out; + *ptr1++ = ya_out; + + xc_out = (xa - xb + xc - xd); + yc_out = (ya - yb + yc - yd); + + /* writing xc' and yc' */ + *ptr1++ = xc_out; + *ptr1++ = yc_out; + + xb_out = (xa + yb - xc - yd); + yb_out = (ya - xb - yc + xd); + + /* writing xb' and yb' */ + *ptr1++ = xb_out; + *ptr1++ = yb_out; + + xd_out = (xa - yb - xc + yd); + yd_out = (ya + xb - yc - xd); + + /* writing xd' and yd' */ + *ptr1++ = xd_out; + *ptr1++ = yd_out; + + + } while (--j); + + /* output is in 11.21(q21) format for the 1024 point */ + /* output is in 9.23(q23) format for the 256 point */ + /* output is in 7.25(q25) format for the 64 point */ + /* output is in 5.27(q27) format for the 16 point */ + + /* End of last stage process */ + +} + + +/** + @brief Core function for the Q31 CIFFT butterfly process. + @param[in,out] pSrc points to the in-place buffer of Q31 data type. + @param[in] fftLen length of the FFT. + @param[in] pCoef points to twiddle coefficient buffer. + @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. + @return none + */ + +/* + * Radix-4 IFFT algorithm used is : + * + * CIFFT uses same twiddle coefficients as CFFT Function + * x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4] + * + * + * IFFT is implemented with following changes in equations from FFT + * + * Input real and imaginary data: + * x(n) = xa + j * ya + * x(n+N/4 ) = xb + j * yb + * x(n+N/2 ) = xc + j * yc + * x(n+3N 4) = xd + j * yd + * + * + * Output real and imaginary data: + * x(4r) = xa'+ j * ya' + * x(4r+1) = xb'+ j * yb' + * x(4r+2) = xc'+ j * yc' + * x(4r+3) = xd'+ j * yd' + * + * + * Twiddle factors for radix-4 IFFT: + * Wn = co1 + j * (si1) + * W2n = co2 + j * (si2) + * W3n = co3 + j * (si3) + + * The real and imaginary output values for the radix-4 butterfly are + * xa' = xa + xb + xc + xd + * ya' = ya + yb + yc + yd + * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) + * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) + * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) + * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) + * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) + * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) + * + */ + +void arm_radix4_butterfly_inverse_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pCoef, + uint32_t twidCoefModifier) +{ + uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k; + q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3; + q31_t xa, xb, xc, xd; + q31_t ya, yb, yc, yd; + q31_t xa_out, xb_out, xc_out, xd_out; + q31_t ya_out, yb_out, yc_out, yd_out; + + q31_t *ptr1; + + /* input is be 1.31(q31) format for all FFT sizes */ + /* Total process is divided into three stages */ + /* process first stage, middle stages, & last stage */ + + /* Start of first stage process */ + + /* Initializations for the first stage */ + n2 = fftLen; + n1 = n2; + /* n2 = fftLen/4 */ + n2 >>= 2U; + i0 = 0U; + ia1 = 0U; + + j = n2; + + do + { + /* input is in 1.31(q31) format and provide 4 guard bits for the input */ + + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Butterfly implementation */ + /* xa + xc */ + r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U); + /* xa - xc */ + r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U); + + /* xb + xd */ + t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U); + + /* ya + yc */ + s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U); + /* ya - yc */ + s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U); + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = (r1 + t1); + /* (xa + xc) - (xb + xd) */ + r1 = r1 - t1; + /* yb + yd */ + t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U); + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (s1 + t2); + + /* (ya + yc) - (yb + yd) */ + s1 = s1 - t2; + + /* yb - yd */ + t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U); + /* xb - xd */ + t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U); + + /* index calculation for the coefficients */ + ia2 = 2U * ia1; + co2 = pCoef[ia2 * 2U]; + si2 = pCoef[(ia2 * 2U) + 1U]; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) - + ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U; + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) + + ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U; + + /* (xa - xc) - (yb - yd) */ + r1 = r2 - t1; + /* (xa - xc) + (yb - yd) */ + r2 = r2 + t1; + + /* (ya - yc) + (xb - xd) */ + s1 = s2 + t2; + /* (ya - yc) - (xb - xd) */ + s2 = s2 - t2; + + co1 = pCoef[ia1 * 2U]; + si1 = pCoef[(ia1 * 2U) + 1U]; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) - + ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U; + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) + + ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U; + + /* index calculation for the coefficients */ + ia3 = 3U * ia1; + co3 = pCoef[ia3 * 2U]; + si3 = pCoef[(ia3 * 2U) + 1U]; + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) - + ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U; + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) + + ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U; + + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + /* Updating input index */ + i0 = i0 + 1U; + + } while (--j); + + /* data is in 5.27(q27) format */ + /* each stage provides two down scaling of the input */ + + + /* Start of Middle stages process */ + + twidCoefModifier <<= 2U; + + /* Calculation of second stage to excluding last stage */ + for (k = fftLen / 4U; k > 4U; k >>= 2U) + { + /* Initializations for the first stage */ + n1 = n2; + n2 >>= 2U; + ia1 = 0U; + + for (j = 0; j <= (n2 - 1U); j++) + { + /* index calculation for the coefficients */ + ia2 = ia1 + ia1; + ia3 = ia2 + ia1; + co1 = pCoef[(ia1 * 2U)]; + si1 = pCoef[(ia1 * 2U) + 1U]; + co2 = pCoef[(ia2 * 2U)]; + si2 = pCoef[(ia2 * 2U) + 1U]; + co3 = pCoef[(ia3 * 2U)]; + si3 = pCoef[(ia3 * 2U) + 1U]; + /* Twiddle coefficients index modifier */ + ia1 = ia1 + twidCoefModifier; + + for (i0 = j; i0 < fftLen; i0 += n1) + { + /* index calculation for the input as, */ + /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; + + /* Butterfly implementation */ + /* xa + xc */ + r1 = pSrc[2U * i0] + pSrc[2U * i2]; + /* xa - xc */ + r2 = pSrc[2U * i0] - pSrc[2U * i2]; + + /* ya + yc */ + s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U]; + /* ya - yc */ + s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U]; + + /* xb + xd */ + t1 = pSrc[2U * i1] + pSrc[2U * i3]; + + /* xa' = xa + xb + xc + xd */ + pSrc[2U * i0] = (r1 + t1) >> 2U; + /* xa + xc -(xb + xd) */ + r1 = r1 - t1; + /* yb + yd */ + t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U]; + /* ya' = ya + yb + yc + yd */ + pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U; + + /* (ya + yc) - (yb + yd) */ + s1 = s1 - t2; + + /* (yb - yd) */ + t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U]; + /* (xb - xd) */ + t2 = pSrc[2U * i1] - pSrc[2U * i3]; + + /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */ + pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) - + ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U; + + /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */ + pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32U)) + + ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U; + + /* (xa - xc) - (yb - yd) */ + r1 = r2 - t1; + /* (xa - xc) + (yb - yd) */ + r2 = r2 + t1; + + /* (ya - yc) + (xb - xd) */ + s1 = s2 + t2; + /* (ya - yc) - (xb - xd) */ + s2 = s2 - t2; + + /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */ + pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) - + ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U; + + /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */ + pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) + + ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U; + + /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */ + pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) - + ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U; + + /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */ + pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) + + ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U; + } + } + twidCoefModifier <<= 2U; + } + + /* End of Middle stages process */ + + /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */ + /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */ + /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */ + /* data is in 5.27(q27) format for the 16 point as there are no middle stages */ + + + /* Start of last stage process */ + + + /* Initializations for the last stage */ + j = fftLen >> 2; + ptr1 = &pSrc[0]; + + /* Calculations of last stage */ + do + { + /* Read xa (real), ya(imag) input */ + xa = *ptr1++; + ya = *ptr1++; + + /* Read xb (real), yb(imag) input */ + xb = *ptr1++; + yb = *ptr1++; + + /* Read xc (real), yc(imag) input */ + xc = *ptr1++; + yc = *ptr1++; + + /* Read xc (real), yc(imag) input */ + xd = *ptr1++; + yd = *ptr1++; + + /* xa' = xa + xb + xc + xd */ + xa_out = xa + xb + xc + xd; + + /* ya' = ya + yb + yc + yd */ + ya_out = ya + yb + yc + yd; + + /* pointer updation for writing */ + ptr1 = ptr1 - 8U; + + /* writing xa' and ya' */ + *ptr1++ = xa_out; + *ptr1++ = ya_out; + + xc_out = (xa - xb + xc - xd); + yc_out = (ya - yb + yc - yd); + + /* writing xc' and yc' */ + *ptr1++ = xc_out; + *ptr1++ = yc_out; + + xb_out = (xa - yb - xc + yd); + yb_out = (ya + xb - yc - xd); + + /* writing xb' and yb' */ + *ptr1++ = xb_out; + *ptr1++ = yb_out; + + xd_out = (xa + yb - xc - yd); + yd_out = (ya - xb - yc + xd); + + /* writing xd' and yd' */ + *ptr1++ = xd_out; + *ptr1++ = yd_out; + + } while (--j); + + /* output is in 11.21(q21) format for the 1024 point */ + /* output is in 9.23(q23) format for the 256 point */ + /* output is in 7.25(q25) format for the 64 point */ + /* output is in 5.27(q27) format for the 16 point */ + + /* End of last stage process */ +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c new file mode 100644 index 0000000..b7eb37a --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c @@ -0,0 +1,289 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix8_f16.c + * Description: Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + +/* ---------------------------------------------------------------------- + * Internal helper function used by the FFTs + * -------------------------------------------------------------------- */ + +/** + brief Core function for the floating-point CFFT butterfly process. + param[in,out] pSrc points to the in-place buffer of floating-point data type. + param[in] fftLen length of the FFT. + param[in] pCoef points to the twiddle coefficient buffer. + param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. + return none +*/ + +void arm_radix8_butterfly_f16( + float16_t * pSrc, + uint16_t fftLen, + const float16_t * pCoef, + uint16_t twidCoefModifier) +{ + uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7; + uint32_t i1, i2, i3, i4, i5, i6, i7, i8; + uint32_t id; + uint32_t n1, n2, j; + + float16_t r1, r2, r3, r4, r5, r6, r7, r8; + float16_t t1, t2; + float16_t s1, s2, s3, s4, s5, s6, s7, s8; + float16_t p1, p2, p3, p4; + float16_t co2, co3, co4, co5, co6, co7, co8; + float16_t si2, si3, si4, si5, si6, si7, si8; + const float16_t C81 = 0.70710678118f16; + + n2 = fftLen; + + do + { + n1 = n2; + n2 = n2 >> 3; + i1 = 0; + + do + { + i2 = i1 + n2; + i3 = i2 + n2; + i4 = i3 + n2; + i5 = i4 + n2; + i6 = i5 + n2; + i7 = i6 + n2; + i8 = i7 + n2; + r1 = (_Float16)pSrc[2 * i1] + (_Float16)pSrc[2 * i5]; + r5 = (_Float16)pSrc[2 * i1] - (_Float16)pSrc[2 * i5]; + r2 = (_Float16)pSrc[2 * i2] + (_Float16)pSrc[2 * i6]; + r6 = (_Float16)pSrc[2 * i2] - (_Float16)pSrc[2 * i6]; + r3 = (_Float16)pSrc[2 * i3] + (_Float16)pSrc[2 * i7]; + r7 = (_Float16)pSrc[2 * i3] - (_Float16)pSrc[2 * i7]; + r4 = (_Float16)pSrc[2 * i4] + (_Float16)pSrc[2 * i8]; + r8 = (_Float16)pSrc[2 * i4] - (_Float16)pSrc[2 * i8]; + t1 = (_Float16)r1 - (_Float16)r3; + r1 = (_Float16)r1 + (_Float16)r3; + r3 = (_Float16)r2 - (_Float16)r4; + r2 = (_Float16)r2 + (_Float16)r4; + pSrc[2 * i1] = (_Float16)r1 + (_Float16)r2; + pSrc[2 * i5] = (_Float16)r1 - (_Float16)r2; + r1 = (_Float16)pSrc[2 * i1 + 1] + (_Float16)pSrc[2 * i5 + 1]; + s5 = (_Float16)pSrc[2 * i1 + 1] - (_Float16)pSrc[2 * i5 + 1]; + r2 = (_Float16)pSrc[2 * i2 + 1] + (_Float16)pSrc[2 * i6 + 1]; + s6 = (_Float16)pSrc[2 * i2 + 1] - (_Float16)pSrc[2 * i6 + 1]; + s3 = (_Float16)pSrc[2 * i3 + 1] + (_Float16)pSrc[2 * i7 + 1]; + s7 = (_Float16)pSrc[2 * i3 + 1] - (_Float16)pSrc[2 * i7 + 1]; + r4 = (_Float16)pSrc[2 * i4 + 1] + (_Float16)pSrc[2 * i8 + 1]; + s8 = (_Float16)pSrc[2 * i4 + 1] - (_Float16)pSrc[2 * i8 + 1]; + t2 = (_Float16)r1 - (_Float16)s3; + r1 = (_Float16)r1 + (_Float16)s3; + s3 = (_Float16)r2 - (_Float16)r4; + r2 = (_Float16)r2 + (_Float16)r4; + pSrc[2 * i1 + 1] = (_Float16)r1 + (_Float16)r2; + pSrc[2 * i5 + 1] = (_Float16)r1 - (_Float16)r2; + pSrc[2 * i3] = (_Float16)t1 + (_Float16)s3; + pSrc[2 * i7] = (_Float16)t1 - (_Float16)s3; + pSrc[2 * i3 + 1] = (_Float16)t2 - (_Float16)r3; + pSrc[2 * i7 + 1] = (_Float16)t2 + (_Float16)r3; + r1 = ((_Float16)r6 - (_Float16)r8) * (_Float16)C81; + r6 = ((_Float16)r6 + (_Float16)r8) * (_Float16)C81; + r2 = ((_Float16)s6 - (_Float16)s8) * (_Float16)C81; + s6 = ((_Float16)s6 + (_Float16)s8) * (_Float16)C81; + t1 = (_Float16)r5 - (_Float16)r1; + r5 = (_Float16)r5 + (_Float16)r1; + r8 = (_Float16)r7 - (_Float16)r6; + r7 = (_Float16)r7 + (_Float16)r6; + t2 = (_Float16)s5 - (_Float16)r2; + s5 = (_Float16)s5 + (_Float16)r2; + s8 = (_Float16)s7 - (_Float16)s6; + s7 = (_Float16)s7 + (_Float16)s6; + pSrc[2 * i2] = (_Float16)r5 + (_Float16)s7; + pSrc[2 * i8] = (_Float16)r5 - (_Float16)s7; + pSrc[2 * i6] = (_Float16)t1 + (_Float16)s8; + pSrc[2 * i4] = (_Float16)t1 - (_Float16)s8; + pSrc[2 * i2 + 1] = (_Float16)s5 - (_Float16)r7; + pSrc[2 * i8 + 1] = (_Float16)s5 + (_Float16)r7; + pSrc[2 * i6 + 1] = (_Float16)t2 - (_Float16)r8; + pSrc[2 * i4 + 1] = (_Float16)t2 + (_Float16)r8; + + i1 += n1; + } while (i1 < fftLen); + + if (n2 < 8) + break; + + ia1 = 0; + j = 1; + + do + { + /* index calculation for the coefficients */ + id = ia1 + twidCoefModifier; + ia1 = id; + ia2 = ia1 + id; + ia3 = ia2 + id; + ia4 = ia3 + id; + ia5 = ia4 + id; + ia6 = ia5 + id; + ia7 = ia6 + id; + + co2 = pCoef[2 * ia1]; + co3 = pCoef[2 * ia2]; + co4 = pCoef[2 * ia3]; + co5 = pCoef[2 * ia4]; + co6 = pCoef[2 * ia5]; + co7 = pCoef[2 * ia6]; + co8 = pCoef[2 * ia7]; + si2 = pCoef[2 * ia1 + 1]; + si3 = pCoef[2 * ia2 + 1]; + si4 = pCoef[2 * ia3 + 1]; + si5 = pCoef[2 * ia4 + 1]; + si6 = pCoef[2 * ia5 + 1]; + si7 = pCoef[2 * ia6 + 1]; + si8 = pCoef[2 * ia7 + 1]; + + i1 = j; + + do + { + /* index calculation for the input */ + i2 = i1 + n2; + i3 = i2 + n2; + i4 = i3 + n2; + i5 = i4 + n2; + i6 = i5 + n2; + i7 = i6 + n2; + i8 = i7 + n2; + r1 = (_Float16)pSrc[2 * i1] + (_Float16)pSrc[2 * i5]; + r5 = (_Float16)pSrc[2 * i1] - (_Float16)pSrc[2 * i5]; + r2 = (_Float16)pSrc[2 * i2] + (_Float16)pSrc[2 * i6]; + r6 = (_Float16)pSrc[2 * i2] - (_Float16)pSrc[2 * i6]; + r3 = (_Float16)pSrc[2 * i3] + (_Float16)pSrc[2 * i7]; + r7 = (_Float16)pSrc[2 * i3] - (_Float16)pSrc[2 * i7]; + r4 = (_Float16)pSrc[2 * i4] + (_Float16)pSrc[2 * i8]; + r8 = (_Float16)pSrc[2 * i4] - (_Float16)pSrc[2 * i8]; + t1 = (_Float16)r1 - (_Float16)r3; + r1 = (_Float16)r1 + (_Float16)r3; + r3 = (_Float16)r2 - (_Float16)r4; + r2 = (_Float16)r2 + (_Float16)r4; + pSrc[2 * i1] = (_Float16)r1 + (_Float16)r2; + r2 = (_Float16)r1 - (_Float16)r2; + s1 = (_Float16)pSrc[2 * i1 + 1] + (_Float16)pSrc[2 * i5 + 1]; + s5 = (_Float16)pSrc[2 * i1 + 1] - (_Float16)pSrc[2 * i5 + 1]; + s2 = (_Float16)pSrc[2 * i2 + 1] + (_Float16)pSrc[2 * i6 + 1]; + s6 = (_Float16)pSrc[2 * i2 + 1] - (_Float16)pSrc[2 * i6 + 1]; + s3 = (_Float16)pSrc[2 * i3 + 1] + (_Float16)pSrc[2 * i7 + 1]; + s7 = (_Float16)pSrc[2 * i3 + 1] - (_Float16)pSrc[2 * i7 + 1]; + s4 = (_Float16)pSrc[2 * i4 + 1] + (_Float16)pSrc[2 * i8 + 1]; + s8 = (_Float16)pSrc[2 * i4 + 1] - (_Float16)pSrc[2 * i8 + 1]; + t2 = (_Float16)s1 - (_Float16)s3; + s1 = (_Float16)s1 + (_Float16)s3; + s3 = (_Float16)s2 - (_Float16)s4; + s2 = (_Float16)s2 + (_Float16)s4; + r1 = (_Float16)t1 + (_Float16)s3; + t1 = (_Float16)t1 - (_Float16)s3; + pSrc[2 * i1 + 1] = (_Float16)s1 + (_Float16)s2; + s2 = (_Float16)s1 - (_Float16)s2; + s1 = (_Float16)t2 - (_Float16)r3; + t2 = (_Float16)t2 + (_Float16)r3; + p1 = (_Float16)co5 * (_Float16)r2; + p2 = (_Float16)si5 * (_Float16)s2; + p3 = (_Float16)co5 * (_Float16)s2; + p4 = (_Float16)si5 * (_Float16)r2; + pSrc[2 * i5] = (_Float16)p1 + (_Float16)p2; + pSrc[2 * i5 + 1] = (_Float16)p3 - (_Float16)p4; + p1 = (_Float16)co3 * (_Float16)r1; + p2 = (_Float16)si3 * (_Float16)s1; + p3 = (_Float16)co3 * (_Float16)s1; + p4 = (_Float16)si3 * (_Float16)r1; + pSrc[2 * i3] = (_Float16)p1 + (_Float16)p2; + pSrc[2 * i3 + 1] = (_Float16)p3 - (_Float16)p4; + p1 = (_Float16)co7 * (_Float16)t1; + p2 = (_Float16)si7 * (_Float16)t2; + p3 = (_Float16)co7 * (_Float16)t2; + p4 = (_Float16)si7 * (_Float16)t1; + pSrc[2 * i7] = (_Float16)p1 + (_Float16)p2; + pSrc[2 * i7 + 1] = (_Float16)p3 - (_Float16)p4; + r1 = ((_Float16)r6 - (_Float16)r8) * (_Float16)C81; + r6 = ((_Float16)r6 + (_Float16)r8) * (_Float16)C81; + s1 = ((_Float16)s6 - (_Float16)s8) * (_Float16)C81; + s6 = ((_Float16)s6 + (_Float16)s8) * (_Float16)C81; + t1 = (_Float16)r5 - (_Float16)r1; + r5 = (_Float16)r5 + (_Float16)r1; + r8 = (_Float16)r7 - (_Float16)r6; + r7 = (_Float16)r7 + (_Float16)r6; + t2 = (_Float16)s5 - (_Float16)s1; + s5 = (_Float16)s5 + (_Float16)s1; + s8 = (_Float16)s7 - (_Float16)s6; + s7 = (_Float16)s7 + (_Float16)s6; + r1 = (_Float16)r5 + (_Float16)s7; + r5 = (_Float16)r5 - (_Float16)s7; + r6 = (_Float16)t1 + (_Float16)s8; + t1 = (_Float16)t1 - (_Float16)s8; + s1 = (_Float16)s5 - (_Float16)r7; + s5 = (_Float16)s5 + (_Float16)r7; + s6 = (_Float16)t2 - (_Float16)r8; + t2 = (_Float16)t2 + (_Float16)r8; + p1 = (_Float16)co2 * (_Float16)r1; + p2 = (_Float16)si2 * (_Float16)s1; + p3 = (_Float16)co2 * (_Float16)s1; + p4 = (_Float16)si2 * (_Float16)r1; + pSrc[2 * i2] = (_Float16)p1 + (_Float16)p2; + pSrc[2 * i2 + 1] = (_Float16)p3 - (_Float16)p4; + p1 = (_Float16)co8 * (_Float16)r5; + p2 = (_Float16)si8 * (_Float16)s5; + p3 = (_Float16)co8 * (_Float16)s5; + p4 = (_Float16)si8 * (_Float16)r5; + pSrc[2 * i8] = (_Float16)p1 + (_Float16)p2; + pSrc[2 * i8 + 1] = (_Float16)p3 - (_Float16)p4; + p1 = (_Float16)co6 * (_Float16)r6; + p2 = (_Float16)si6 * (_Float16)s6; + p3 = (_Float16)co6 * (_Float16)s6; + p4 = (_Float16)si6 * (_Float16)r6; + pSrc[2 * i6] = (_Float16)p1 + (_Float16)p2; + pSrc[2 * i6 + 1] = (_Float16)p3 - (_Float16)p4; + p1 = (_Float16)co4 * (_Float16)t1; + p2 = (_Float16)si4 * (_Float16)t2; + p3 = (_Float16)co4 * (_Float16)t2; + p4 = (_Float16)si4 * (_Float16)t1; + pSrc[2 * i4] = (_Float16)p1 + (_Float16)p2; + pSrc[2 * i4 + 1] = (_Float16)p3 - (_Float16)p4; + + i1 += n1; + } while (i1 < fftLen); + + j++; + } while (j < n2); + + twidCoefModifier <<= 3; + } while (n2 > 7); +} + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c new file mode 100644 index 0000000..7990d47 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c @@ -0,0 +1,285 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_cfft_radix8_f32.c + * Description: Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + + +/* ---------------------------------------------------------------------- + * Internal helper function used by the FFTs + * -------------------------------------------------------------------- */ + +/** + brief Core function for the floating-point CFFT butterfly process. + param[in,out] pSrc points to the in-place buffer of floating-point data type. + param[in] fftLen length of the FFT. + param[in] pCoef points to the twiddle coefficient buffer. + param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. + return none +*/ + +void arm_radix8_butterfly_f32( + float32_t * pSrc, + uint16_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier) +{ + uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7; + uint32_t i1, i2, i3, i4, i5, i6, i7, i8; + uint32_t id; + uint32_t n1, n2, j; + + float32_t r1, r2, r3, r4, r5, r6, r7, r8; + float32_t t1, t2; + float32_t s1, s2, s3, s4, s5, s6, s7, s8; + float32_t p1, p2, p3, p4; + float32_t co2, co3, co4, co5, co6, co7, co8; + float32_t si2, si3, si4, si5, si6, si7, si8; + const float32_t C81 = 0.70710678118f; + + n2 = fftLen; + + do + { + n1 = n2; + n2 = n2 >> 3; + i1 = 0; + + do + { + i2 = i1 + n2; + i3 = i2 + n2; + i4 = i3 + n2; + i5 = i4 + n2; + i6 = i5 + n2; + i7 = i6 + n2; + i8 = i7 + n2; + r1 = pSrc[2 * i1] + pSrc[2 * i5]; + r5 = pSrc[2 * i1] - pSrc[2 * i5]; + r2 = pSrc[2 * i2] + pSrc[2 * i6]; + r6 = pSrc[2 * i2] - pSrc[2 * i6]; + r3 = pSrc[2 * i3] + pSrc[2 * i7]; + r7 = pSrc[2 * i3] - pSrc[2 * i7]; + r4 = pSrc[2 * i4] + pSrc[2 * i8]; + r8 = pSrc[2 * i4] - pSrc[2 * i8]; + t1 = r1 - r3; + r1 = r1 + r3; + r3 = r2 - r4; + r2 = r2 + r4; + pSrc[2 * i1] = r1 + r2; + pSrc[2 * i5] = r1 - r2; + r1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1]; + s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1]; + r2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1]; + s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1]; + s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1]; + s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1]; + r4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1]; + s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1]; + t2 = r1 - s3; + r1 = r1 + s3; + s3 = r2 - r4; + r2 = r2 + r4; + pSrc[2 * i1 + 1] = r1 + r2; + pSrc[2 * i5 + 1] = r1 - r2; + pSrc[2 * i3] = t1 + s3; + pSrc[2 * i7] = t1 - s3; + pSrc[2 * i3 + 1] = t2 - r3; + pSrc[2 * i7 + 1] = t2 + r3; + r1 = (r6 - r8) * C81; + r6 = (r6 + r8) * C81; + r2 = (s6 - s8) * C81; + s6 = (s6 + s8) * C81; + t1 = r5 - r1; + r5 = r5 + r1; + r8 = r7 - r6; + r7 = r7 + r6; + t2 = s5 - r2; + s5 = s5 + r2; + s8 = s7 - s6; + s7 = s7 + s6; + pSrc[2 * i2] = r5 + s7; + pSrc[2 * i8] = r5 - s7; + pSrc[2 * i6] = t1 + s8; + pSrc[2 * i4] = t1 - s8; + pSrc[2 * i2 + 1] = s5 - r7; + pSrc[2 * i8 + 1] = s5 + r7; + pSrc[2 * i6 + 1] = t2 - r8; + pSrc[2 * i4 + 1] = t2 + r8; + + i1 += n1; + } while (i1 < fftLen); + + if (n2 < 8) + break; + + ia1 = 0; + j = 1; + + do + { + /* index calculation for the coefficients */ + id = ia1 + twidCoefModifier; + ia1 = id; + ia2 = ia1 + id; + ia3 = ia2 + id; + ia4 = ia3 + id; + ia5 = ia4 + id; + ia6 = ia5 + id; + ia7 = ia6 + id; + + co2 = pCoef[2 * ia1]; + co3 = pCoef[2 * ia2]; + co4 = pCoef[2 * ia3]; + co5 = pCoef[2 * ia4]; + co6 = pCoef[2 * ia5]; + co7 = pCoef[2 * ia6]; + co8 = pCoef[2 * ia7]; + si2 = pCoef[2 * ia1 + 1]; + si3 = pCoef[2 * ia2 + 1]; + si4 = pCoef[2 * ia3 + 1]; + si5 = pCoef[2 * ia4 + 1]; + si6 = pCoef[2 * ia5 + 1]; + si7 = pCoef[2 * ia6 + 1]; + si8 = pCoef[2 * ia7 + 1]; + + i1 = j; + + do + { + /* index calculation for the input */ + i2 = i1 + n2; + i3 = i2 + n2; + i4 = i3 + n2; + i5 = i4 + n2; + i6 = i5 + n2; + i7 = i6 + n2; + i8 = i7 + n2; + r1 = pSrc[2 * i1] + pSrc[2 * i5]; + r5 = pSrc[2 * i1] - pSrc[2 * i5]; + r2 = pSrc[2 * i2] + pSrc[2 * i6]; + r6 = pSrc[2 * i2] - pSrc[2 * i6]; + r3 = pSrc[2 * i3] + pSrc[2 * i7]; + r7 = pSrc[2 * i3] - pSrc[2 * i7]; + r4 = pSrc[2 * i4] + pSrc[2 * i8]; + r8 = pSrc[2 * i4] - pSrc[2 * i8]; + t1 = r1 - r3; + r1 = r1 + r3; + r3 = r2 - r4; + r2 = r2 + r4; + pSrc[2 * i1] = r1 + r2; + r2 = r1 - r2; + s1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1]; + s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1]; + s2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1]; + s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1]; + s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1]; + s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1]; + s4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1]; + s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1]; + t2 = s1 - s3; + s1 = s1 + s3; + s3 = s2 - s4; + s2 = s2 + s4; + r1 = t1 + s3; + t1 = t1 - s3; + pSrc[2 * i1 + 1] = s1 + s2; + s2 = s1 - s2; + s1 = t2 - r3; + t2 = t2 + r3; + p1 = co5 * r2; + p2 = si5 * s2; + p3 = co5 * s2; + p4 = si5 * r2; + pSrc[2 * i5] = p1 + p2; + pSrc[2 * i5 + 1] = p3 - p4; + p1 = co3 * r1; + p2 = si3 * s1; + p3 = co3 * s1; + p4 = si3 * r1; + pSrc[2 * i3] = p1 + p2; + pSrc[2 * i3 + 1] = p3 - p4; + p1 = co7 * t1; + p2 = si7 * t2; + p3 = co7 * t2; + p4 = si7 * t1; + pSrc[2 * i7] = p1 + p2; + pSrc[2 * i7 + 1] = p3 - p4; + r1 = (r6 - r8) * C81; + r6 = (r6 + r8) * C81; + s1 = (s6 - s8) * C81; + s6 = (s6 + s8) * C81; + t1 = r5 - r1; + r5 = r5 + r1; + r8 = r7 - r6; + r7 = r7 + r6; + t2 = s5 - s1; + s5 = s5 + s1; + s8 = s7 - s6; + s7 = s7 + s6; + r1 = r5 + s7; + r5 = r5 - s7; + r6 = t1 + s8; + t1 = t1 - s8; + s1 = s5 - r7; + s5 = s5 + r7; + s6 = t2 - r8; + t2 = t2 + r8; + p1 = co2 * r1; + p2 = si2 * s1; + p3 = co2 * s1; + p4 = si2 * r1; + pSrc[2 * i2] = p1 + p2; + pSrc[2 * i2 + 1] = p3 - p4; + p1 = co8 * r5; + p2 = si8 * s5; + p3 = co8 * s5; + p4 = si8 * r5; + pSrc[2 * i8] = p1 + p2; + pSrc[2 * i8 + 1] = p3 - p4; + p1 = co6 * r6; + p2 = si6 * s6; + p3 = co6 * s6; + p4 = si6 * r6; + pSrc[2 * i6] = p1 + p2; + pSrc[2 * i6 + 1] = p3 - p4; + p1 = co4 * t1; + p2 = si4 * t2; + p3 = co4 * t2; + p4 = si4 * t1; + pSrc[2 * i4] = p1 + p2; + pSrc[2 * i4 + 1] = p3 - p4; + + i1 += n1; + } while (i1 < fftLen); + + j++; + } while (j < n2); + + twidCoefModifier <<= 3; + } while (n2 > 7); +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c new file mode 100644 index 0000000..b9dff3f --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c @@ -0,0 +1,448 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_dct4_f32.c + * Description: Processing function of DCT4 & IDCT4 F32 + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +/** + @ingroup groupTransforms + */ + +/** + @defgroup DCT4_IDCT4 DCT Type IV Functions + + Representation of signals by minimum number of values is important for storage and transmission. + The possibility of large discontinuity between the beginning and end of a period of a signal + in DFT can be avoided by extending the signal so that it is even-symmetric. + Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the + spectrum and is very widely used in signal and image coding applications. + The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions. + DCT has an excellent energy-packing capability, hence has many applications and in data compression in particular. + + DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal. + Reordering of the input data makes the computation of DCT just a problem of + computing the DFT of a real signal with a few additional operations. + This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations. + + DCT type-II can be implemented using Fast fourier transform (FFT) internally, as the transform is applied on real values, Real FFT can be used. + DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing. + DCT2 implementation can be described in the following steps: + - Re-ordering input + - Calculating Real FFT + - Multiplication of weights and Real FFT output and getting real part from the product. + + This process is explained by the block diagram below: + \image html DCT4.gif "Discrete Cosine Transform - type-IV" + + @par Algorithm + The N-point type-IV DCT is defined as a real, linear transformation by the formula: + \image html DCT4Equation.gif + where <code>k = 0, 1, 2, ..., N-1</code> + @par + Its inverse is defined as follows: + \image html IDCT4Equation.gif + where <code>n = 0, 1, 2, ..., N-1</code> + @par + The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N). + The symmetry of the transform matrix indicates that the fast algorithms for the forward + and inverse transform computation are identical. + Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both. + + @par Lengths supported by the transform: + As DCT4 internally uses Real FFT, it supports all the lengths 128, 512, 2048 and 8192. + The library provides separate functions for Q15, Q31, and floating-point data types. + + @par Instance Structure + The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure. + A separate instance structure must be defined for each transform. + There are separate instance structure declarations for each of the 3 supported data types. + + @par Initialization Functions + There is also an associated initialization function for each data type. + The initialization function performs the following operations: + - Sets the values of the internal structure fields. + - Initializes Real FFT as its process function is used internally in DCT4, by calling \ref arm_rfft_init_f32(). + @par + Use of the initialization function is optional. + However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + To place an instance structure into a const data section, the instance structure must be manually initialized. + Manually initialize the instance structure as follows: + <pre> + arm_dct4_instance_f32 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft}; + arm_dct4_instance_q31 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft}; + arm_dct4_instance_q15 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft}; + </pre> + where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4; + \c normalize is normalizing factor used and is equal to <code>sqrt(2/N)</code>; + \c pTwiddle points to the twiddle factor table; + \c pCosFactor points to the cosFactor table; + \c pRfft points to the real FFT instance; + \c pCfft points to the complex FFT instance; + The CFFT and RFFT structures also needs to be initialized, refer to arm_cfft_radix4_f32() + and arm_rfft_f32() respectively for details regarding static initialization. + + @par Fixed-Point Behavior + Care must be taken when using the fixed-point versions of the DCT4 transform functions. + In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. + Refer to the function specific documentation below for usage guidelines. + */ + + /** + @addtogroup DCT4_IDCT4 + @{ + */ + +/** + @brief Processing function for the floating-point DCT4/IDCT4. + @param[in] S points to an instance of the floating-point DCT4/IDCT4 structure + @param[in] pState points to state buffer + @param[in,out] pInlineBuffer points to the in-place input and output buffer + @return none + */ + +void arm_dct4_f32( + const arm_dct4_instance_f32 * S, + float32_t * pState, + float32_t * pInlineBuffer) +{ + const float32_t *weights = S->pTwiddle; /* Pointer to the Weights table */ + const float32_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ + float32_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ + float32_t in; /* Temporary variable */ + uint32_t i; /* Loop counter */ + + + /* DCT4 computation involves DCT2 (which is calculated using RFFT) + * along with some pre-processing and post-processing. + * Computational procedure is explained as follows: + * (a) Pre-processing involves multiplying input with cos factor, + * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) + * where, + * r(n) -- output of preprocessing + * u(n) -- input to preprocessing(actual Source buffer) + * (b) Calculation of DCT2 using FFT is divided into three steps: + * Step1: Re-ordering of even and odd elements of input. + * Step2: Calculating FFT of the re-ordered input. + * Step3: Taking the real part of the product of FFT output and weights. + * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * where, + * Y4 -- DCT4 output, Y2 -- DCT2 output + * (d) Multiplying the output with the normalizing factor sqrt(2/N). + */ + + /*-------- Pre-processing ------------*/ + /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ + arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N); + arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N); + + /* ---------------------------------------------------------------- + * Step1: Re-ordering of even and odd elements as + * pState[i] = pInlineBuffer[2*i] and + * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 + ---------------------------------------------------------------------*/ + + /* pS1 initialized to pState */ + pS1 = pState; + + /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ + pS2 = pState + (S->N - 1U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ + i = S->Nby2 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + do + { + /* Re-ordering of even and odd elements */ + /* pState[i] = pInlineBuffer[2*i] */ + *pS1++ = *pbuff++; + /* pState[N-i-1] = pInlineBuffer[2*i+1] */ + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + /* Decrement loop counter */ + i--; + } while (i > 0U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Initializing the loop counter to N/4 instead of N for loop unrolling */ + i = S->N >> 2U; + + /* Processing with loop unrolling 4 times as N is always multiple of 4. + * Compute 4 outputs at a time */ + do + { + /* Writing the re-ordered output back to inplace input buffer */ + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + + /* --------------------------------------------------------- + * Step2: Calculate RFFT for N-point input + * ---------------------------------------------------------- */ + /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ + arm_rfft_f32 (S->pRfft, pInlineBuffer, pState); + + /*---------------------------------------------------------------------- + * Step3: Multiply the FFT output with the weights. + *----------------------------------------------------------------------*/ + arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N); + + /* ----------- Post-processing ---------- */ + /* DCT-IV can be obtained from DCT-II by the equation, + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * Hence, Y4(0) = Y2(0)/2 */ + /* Getting only real part from the output and Converting to DCT-IV */ + + /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ + i = (S->N - 1U) >> 2U; + + /* pbuff initialized to input buffer. */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ + in = *pS1++ * (float32_t) 0.5; + /* input buffer acts as inplace, so output values are stored in the input itself. */ + *pbuff++ = in; + + /* pState pointer is incremented twice as the real values are located alternatively in the array */ + pS1++; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + do + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + /* points to the next real value */ + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + i = (S->N - 1U) % 0x4U; + + while (i > 0U) + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + + /* points to the next real value */ + pS1++; + + /* Decrement the loop counter */ + i--; + } + + + /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ + + /* Initializing the loop counter to N/4 instead of N for loop unrolling */ + i = S->N >> 2U; + + /* pbuff initialized to the pInlineBuffer(now contains the output values) */ + pbuff = pInlineBuffer; + + /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */ + do + { + /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ + in = *pbuff; + *pbuff++ = in * S->normalize; + + in = *pbuff; + *pbuff++ = in * S->normalize; + + in = *pbuff; + *pbuff++ = in * S->normalize; + + in = *pbuff; + *pbuff++ = in * S->normalize; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + +#else + + /* Initializing the loop counter to N/2 */ + i = S->Nby2; + + do + { + /* Re-ordering of even and odd elements */ + /* pState[i] = pInlineBuffer[2*i] */ + *pS1++ = *pbuff++; + /* pState[N-i-1] = pInlineBuffer[2*i+1] */ + *pS2-- = *pbuff++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Initializing the loop counter */ + i = S->N; + + do + { + /* Writing the re-ordered output back to inplace input buffer */ + *pbuff++ = *pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + + /* --------------------------------------------------------- + * Step2: Calculate RFFT for N-point input + * ---------------------------------------------------------- */ + /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ + arm_rfft_f32 (S->pRfft, pInlineBuffer, pState); + + /*---------------------------------------------------------------------- + * Step3: Multiply the FFT output with the weights. + *----------------------------------------------------------------------*/ + arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N); + + /* ----------- Post-processing ---------- */ + /* DCT-IV can be obtained from DCT-II by the equation, + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * Hence, Y4(0) = Y2(0)/2 */ + /* Getting only real part from the output and Converting to DCT-IV */ + + /* pbuff initialized to input buffer. */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ + in = *pS1++ * (float32_t) 0.5; + /* input buffer acts as inplace, so output values are stored in the input itself. */ + *pbuff++ = in; + + /* pState pointer is incremented twice as the real values are located alternatively in the array */ + pS1++; + + /* Initializing the loop counter */ + i = (S->N - 1U); + + do + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + + /* points to the next real value */ + pS1++; + + /* Decrement loop counter */ + i--; + } while (i > 0U); + + /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ + + /* Initializing loop counter */ + i = S->N; + + /* pbuff initialized to the pInlineBuffer (now contains the output values) */ + pbuff = pInlineBuffer; + + do + { + /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ + in = *pbuff; + *pbuff++ = in * S->normalize; + + /* Decrement loop counter */ + i--; + } while (i > 0U); + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + +} + +/** + @} end of DCT4_IDCT4 group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c new file mode 100644 index 0000000..e1d80c0 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c @@ -0,0 +1,130 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_dct4_init_f32.c + * Description: Initialization function of DCT-4 & IDCT4 F32 + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + + /** + @addtogroup DCT4_IDCT4 + @{ + */ + +/** + @brief Initialization function for the floating-point DCT4/IDCT4. + @param[in,out] S points to an instance of floating-point DCT4/IDCT4 structure + @param[in] S_RFFT points to an instance of floating-point RFFT/RIFFT structure + @param[in] S_CFFT points to an instance of floating-point CFFT/CIFFT structure + @param[in] N length of the DCT4 + @param[in] Nby2 half of the length of the DCT4 + @param[in] normalize normalizing factor. + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length + + @par Normalizing factor + The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>. + Floating-point normalizing factors are mentioned in the table below for different DCT sizes: + + \image html dct4NormalizingF32Table.gif + */ + +arm_status arm_dct4_init_f32( + arm_dct4_instance_f32 * S, + arm_rfft_instance_f32 * S_RFFT, + arm_cfft_radix4_instance_f32 * S_CFFT, + uint16_t N, + uint16_t Nby2, + float32_t normalize) +{ + /* Initialize the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + + /* Initialize the DCT4 length */ + S->N = N; + + /* Initialize the half of DCT4 length */ + S->Nby2 = Nby2; + + /* Initialize the DCT4 Normalizing factor */ + S->normalize = normalize; + + /* Initialize Real FFT Instance */ + S->pRfft = S_RFFT; + + /* Initialize Complex FFT Instance */ + S->pCfft = S_CFFT; + + switch (N) + { + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_8192) + /* Initialize the table modifier values */ + case 8192U: + S->pTwiddle = Weights_8192; + S->pCosFactor = cos_factors_8192; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_2048) + case 2048U: + S->pTwiddle = Weights_2048; + S->pCosFactor = cos_factors_2048; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_512) + case 512U: + S->pTwiddle = Weights_512; + S->pCosFactor = cos_factors_512; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_128) + case 128U: + S->pTwiddle = Weights_128; + S->pCosFactor = cos_factors_128; + break; + #endif + default: + status = ARM_MATH_ARGUMENT_ERROR; + } + + /* Initialize the RFFT/RIFFT Function */ + arm_rfft_init_f32(S->pRfft, S->pCfft, S->N, 0U, 1U); + + /* return the status of DCT4 Init function */ + return (status); +} + +/** + @} end of DCT4_IDCT4 group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c new file mode 100644 index 0000000..5390da3 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c @@ -0,0 +1,130 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_dct4_init_q15.c + * Description: Initialization function of DCT-4 & IDCT4 Q15 + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + + /** + @addtogroup DCT4_IDCT4 + @{ + */ + +/** + @brief Initialization function for the Q15 DCT4/IDCT4. + @param[in,out] S points to an instance of Q15 DCT4/IDCT4 structure + @param[in] S_RFFT points to an instance of Q15 RFFT/RIFFT structure + @param[in] S_CFFT points to an instance of Q15 CFFT/CIFFT structure + @param[in] N length of the DCT4 + @param[in] Nby2 half of the length of the DCT4 + @param[in] normalize normalizing factor + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length + + @par Normalizing factor + The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>. + Normalizing factors in 1.15 format are mentioned in the table below for different DCT sizes: + + \image html dct4NormalizingQ15Table.gif + */ + +arm_status arm_dct4_init_q15( + arm_dct4_instance_q15 * S, + arm_rfft_instance_q15 * S_RFFT, + arm_cfft_radix4_instance_q15 * S_CFFT, + uint16_t N, + uint16_t Nby2, + q15_t normalize) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialize the DCT4 length */ + S->N = N; + + /* Initialize the half of DCT4 length */ + S->Nby2 = Nby2; + + /* Initialize the DCT4 Normalizing factor */ + S->normalize = normalize; + + /* Initialize Real FFT Instance */ + S->pRfft = S_RFFT; + + /* Initialize Complex FFT Instance */ + S->pCfft = S_CFFT; + + switch (N) + { + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_8192) + /* Initialize the table modifier values */ + case 8192U: + S->pTwiddle = WeightsQ15_8192; + S->pCosFactor = cos_factorsQ15_8192; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_2048) + case 2048U: + S->pTwiddle = WeightsQ15_2048; + S->pCosFactor = cos_factorsQ15_2048; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_512) + case 512U: + S->pTwiddle = WeightsQ15_512; + S->pCosFactor = cos_factorsQ15_512; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_128) + case 128U: + S->pTwiddle = WeightsQ15_128; + S->pCosFactor = cos_factorsQ15_128; + break; + #endif + + default: + status = ARM_MATH_ARGUMENT_ERROR; + } + + /* Initialize the RFFT/RIFFT */ + arm_rfft_init_q15(S->pRfft, S->N, 0U, 1U); + + /* return the status of DCT4 Init function */ + return (status); +} + +/** + @} end of DCT4_IDCT4 group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c new file mode 100644 index 0000000..4c7622a --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c @@ -0,0 +1,129 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_dct4_init_q31.c + * Description: Initialization function of DCT-4 & IDCT4 Q31 + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + + /** + @addtogroup DCT4_IDCT4 + @{ + */ + +/** + @brief Initialization function for the Q31 DCT4/IDCT4. + @param[in,out] S points to an instance of Q31 DCT4/IDCT4 structure. + @param[in] S_RFFT points to an instance of Q31 RFFT/RIFFT structure + @param[in] S_CFFT points to an instance of Q31 CFFT/CIFFT structure + @param[in] N length of the DCT4. + @param[in] Nby2 half of the length of the DCT4. + @param[in] normalize normalizing factor. + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length + + @par Normalizing factor: + The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>. + Normalizing factors in 1.31 format are mentioned in the table below for different DCT sizes: + + \image html dct4NormalizingQ31Table.gif + */ + +arm_status arm_dct4_init_q31( + arm_dct4_instance_q31 * S, + arm_rfft_instance_q31 * S_RFFT, + arm_cfft_radix4_instance_q31 * S_CFFT, + uint16_t N, + uint16_t Nby2, + q31_t normalize) +{ + /* Initialize the default arm status */ + arm_status status = ARM_MATH_SUCCESS; + + /* Initialize the DCT4 length */ + S->N = N; + + /* Initialize the half of DCT4 length */ + S->Nby2 = Nby2; + + /* Initialize the DCT4 Normalizing factor */ + S->normalize = normalize; + + /* Initialize Real FFT Instance */ + S->pRfft = S_RFFT; + + /* Initialize Complex FFT Instance */ + S->pCfft = S_CFFT; + + switch (N) + { + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_8192) + /* Initialize the table modifier values */ + case 8192U: + S->pTwiddle = WeightsQ31_8192; + S->pCosFactor = cos_factorsQ31_8192; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_2048) + case 2048U: + S->pTwiddle = WeightsQ31_2048; + S->pCosFactor = cos_factorsQ31_2048; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_512) + case 512U: + S->pTwiddle = WeightsQ31_512; + S->pCosFactor = cos_factorsQ31_512; + break; + #endif + + #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_128) + case 128U: + S->pTwiddle = WeightsQ31_128; + S->pCosFactor = cos_factorsQ31_128; + break; + #endif + default: + status = ARM_MATH_ARGUMENT_ERROR; + } + + /* Initialize the RFFT/RIFFT Function */ + arm_rfft_init_q31(S->pRfft, S->N, 0U, 1U); + + /* return the status of DCT4 Init function */ + return (status); +} + +/** + @} end of DCT4_IDCT4 group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c new file mode 100644 index 0000000..a4650da --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c @@ -0,0 +1,381 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_dct4_q15.c + * Description: Processing function of DCT4 & IDCT4 Q15 + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +/** + @addtogroup DCT4_IDCT4 + @{ + */ + +/** + @brief Processing function for the Q15 DCT4/IDCT4. + @param[in] S points to an instance of the Q15 DCT4 structure. + @param[in] pState points to state buffer. + @param[in,out] pInlineBuffer points to the in-place input and output buffer. + @return none + + @par Input an output formats + Internally inputs are downscaled in the RFFT process function to avoid overflows. + Number of bits downscaled, depends on the size of the transform. The input and output + formats for different DCT sizes and number of bits to upscale are mentioned in the table below: + + \image html dct4FormatsQ15Table.gif + */ + +void arm_dct4_q15( + const arm_dct4_instance_q15 * S, + q15_t * pState, + q15_t * pInlineBuffer) +{ + const q15_t *weights = S->pTwiddle; /* Pointer to the Weights table */ + const q15_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ + q15_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ + q15_t in; /* Temporary variable */ + uint32_t i; /* Loop counter */ + + + /* DCT4 computation involves DCT2 (which is calculated using RFFT) + * along with some pre-processing and post-processing. + * Computational procedure is explained as follows: + * (a) Pre-processing involves multiplying input with cos factor, + * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) + * where, + * r(n) -- output of preprocessing + * u(n) -- input to preprocessing(actual Source buffer) + * (b) Calculation of DCT2 using FFT is divided into three steps: + * Step1: Re-ordering of even and odd elements of input. + * Step2: Calculating FFT of the re-ordered input. + * Step3: Taking the real part of the product of FFT output and weights. + * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * where, + * Y4 -- DCT4 output, Y2 -- DCT2 output + * (d) Multiplying the output with the normalizing factor sqrt(2/N). + */ + + /*-------- Pre-processing ------------*/ + /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ + arm_mult_q15 (pInlineBuffer, cosFact, pInlineBuffer, S->N); + arm_shift_q15 (pInlineBuffer, 1, pInlineBuffer, S->N); + + /* ---------------------------------------------------------------- + * Step1: Re-ordering of even and odd elements as + * pState[i] = pInlineBuffer[2*i] and + * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 + ---------------------------------------------------------------------*/ + + /* pS1 initialized to pState */ + pS1 = pState; + + /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ + pS2 = pState + (S->N - 1U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ + i = S->Nby2 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + do + { + /* Re-ordering of even and odd elements */ + /* pState[i] = pInlineBuffer[2*i] */ + *pS1++ = *pbuff++; + /* pState[N-i-1] = pInlineBuffer[2*i+1] */ + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + /* Decrement loop counter */ + i--; + } while (i > 0U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Initializing the loop counter to N/4 instead of N for loop unrolling */ + i = S->N >> 2U; + + /* Processing with loop unrolling 4 times as N is always multiple of 4. + * Compute 4 outputs at a time */ + do + { + /* Writing the re-ordered output back to inplace input buffer */ + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + + /* --------------------------------------------------------- + * Step2: Calculate RFFT for N-point input + * ---------------------------------------------------------- */ + /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ + arm_rfft_q15 (S->pRfft, pInlineBuffer, pState); + + /*---------------------------------------------------------------------- + * Step3: Multiply the FFT output with the weights. + *----------------------------------------------------------------------*/ + arm_cmplx_mult_cmplx_q15 (pState, weights, pState, S->N); + + /* The output of complex multiplication is in 3.13 format. + * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */ + arm_shift_q15 (pState, 2, pState, S->N * 2); + + /* ----------- Post-processing ---------- */ + /* DCT-IV can be obtained from DCT-II by the equation, + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * Hence, Y4(0) = Y2(0)/2 */ + /* Getting only real part from the output and Converting to DCT-IV */ + + /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ + i = (S->N - 1U) >> 2U; + + /* pbuff initialized to input buffer. */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ + in = *pS1++ >> 1U; + /* input buffer acts as inplace, so output values are stored in the input itself. */ + *pbuff++ = in; + + /* pState pointer is incremented twice as the real values are located alternatively in the array */ + pS1++; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + do + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + /* points to the next real value */ + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + i = (S->N - 1U) % 0x4U; + + while (i > 0U) + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + + /* points to the next real value */ + pS1++; + + /* Decrement loop counter */ + i--; + } + + + /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ + + /* Initializing the loop counter to N/4 instead of N for loop unrolling */ + i = S->N >> 2U; + + /* pbuff initialized to the pInlineBuffer(now contains the output values) */ + pbuff = pInlineBuffer; + + /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */ + do + { + /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ + in = *pbuff; + *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); + + in = *pbuff; + *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); + + in = *pbuff; + *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); + + in = *pbuff; + *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); + + /* Decrement loop counter */ + i--; + } while (i > 0U); + + +#else + + /* Initializing the loop counter to N/2 */ + i = S->Nby2; + + do + { + /* Re-ordering of even and odd elements */ + /* pState[i] = pInlineBuffer[2*i] */ + *pS1++ = *pbuff++; + /* pState[N-i-1] = pInlineBuffer[2*i+1] */ + *pS2-- = *pbuff++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Initializing the loop counter */ + i = S->N; + + do + { + /* Writing the re-ordered output back to inplace input buffer */ + *pbuff++ = *pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + + /* --------------------------------------------------------- + * Step2: Calculate RFFT for N-point input + * ---------------------------------------------------------- */ + /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ + arm_rfft_q15 (S->pRfft, pInlineBuffer, pState); + + /*---------------------------------------------------------------------- + * Step3: Multiply the FFT output with the weights. + *----------------------------------------------------------------------*/ + arm_cmplx_mult_cmplx_q15 (pState, weights, pState, S->N); + + /* The output of complex multiplication is in 3.13 format. + * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */ + arm_shift_q15 (pState, 2, pState, S->N * 2); + + /* ----------- Post-processing ---------- */ + /* DCT-IV can be obtained from DCT-II by the equation, + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * Hence, Y4(0) = Y2(0)/2 */ + /* Getting only real part from the output and Converting to DCT-IV */ + + /* pbuff initialized to input buffer. */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ + in = *pS1++ >> 1U; + /* input buffer acts as inplace, so output values are stored in the input itself. */ + *pbuff++ = in; + + /* pState pointer is incremented twice as the real values are located alternatively in the array */ + pS1++; + + /* Initializing the loop counter */ + i = (S->N - 1U); + + do + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + + /* points to the next real value */ + pS1++; + + /* Decrement loop counter */ + i--; + } while (i > 0U); + + /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ + + /* Initializing loop counter */ + i = S->N; + + /* pbuff initialized to the pInlineBuffer (now contains the output values) */ + pbuff = pInlineBuffer; + + do + { + /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ + in = *pbuff; + *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15)); + + /* Decrement loop counter */ + i--; + + } while (i > 0U); + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + +} + +/** + @} end of DCT4_IDCT4 group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c new file mode 100644 index 0000000..6cbccff --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c @@ -0,0 +1,383 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_dct4_q31.c + * Description: Processing function of DCT4 & IDCT4 Q31 + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +/** + @addtogroup DCT4_IDCT4 + @{ + */ + +/** + @brief Processing function for the Q31 DCT4/IDCT4. + @param[in] S points to an instance of the Q31 DCT4 structure. + @param[in] pState points to state buffer. + @param[in,out] pInlineBuffer points to the in-place input and output buffer. + @return none + + @par Input an output formats + Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process, + as the conversion from DCT2 to DCT4 involves one subtraction. + Internally inputs are downscaled in the RFFT process function to avoid overflows. + Number of bits downscaled, depends on the size of the transform. + The input and output formats for different DCT sizes and number of bits to upscale are + mentioned in the table below: + + \image html dct4FormatsQ31Table.gif + */ + +void arm_dct4_q31( + const arm_dct4_instance_q31 * S, + q31_t * pState, + q31_t * pInlineBuffer) +{ + const q31_t *weights = S->pTwiddle; /* Pointer to the Weights table */ + const q31_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ + q31_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ + q31_t in; /* Temporary variable */ + uint32_t i; /* Loop counter */ + + + /* DCT4 computation involves DCT2 (which is calculated using RFFT) + * along with some pre-processing and post-processing. + * Computational procedure is explained as follows: + * (a) Pre-processing involves multiplying input with cos factor, + * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) + * where, + * r(n) -- output of preprocessing + * u(n) -- input to preprocessing(actual Source buffer) + * (b) Calculation of DCT2 using FFT is divided into three steps: + * Step1: Re-ordering of even and odd elements of input. + * Step2: Calculating FFT of the re-ordered input. + * Step3: Taking the real part of the product of FFT output and weights. + * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * where, + * Y4 -- DCT4 output, Y2 -- DCT2 output + * (d) Multiplying the output with the normalizing factor sqrt(2/N). + */ + + /*-------- Pre-processing ------------*/ + /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ + arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N); + arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N); + + /* ---------------------------------------------------------------- + * Step1: Re-ordering of even and odd elements as + * pState[i] = pInlineBuffer[2*i] and + * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 + ---------------------------------------------------------------------*/ + + /* pS1 initialized to pState */ + pS1 = pState; + + /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ + pS2 = pState + (S->N - 1U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ + i = S->Nby2 >> 2U; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + do + { + /* Re-ordering of even and odd elements */ + /* pState[i] = pInlineBuffer[2*i] */ + *pS1++ = *pbuff++; + /* pState[N-i-1] = pInlineBuffer[2*i+1] */ + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + *pS1++ = *pbuff++; + *pS2-- = *pbuff++; + + /* Decrement loop counter */ + i--; + } while (i > 0U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Initializing the loop counter to N/4 instead of N for loop unrolling */ + i = S->N >> 2U; + + /* Processing with loop unrolling 4 times as N is always multiple of 4. + * Compute 4 outputs at a time */ + do + { + /* Writing the re-ordered output back to inplace input buffer */ + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + *pbuff++ = *pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + + /* --------------------------------------------------------- + * Step2: Calculate RFFT for N-point input + * ---------------------------------------------------------- */ + /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ + arm_rfft_q31 (S->pRfft, pInlineBuffer, pState); + + /*---------------------------------------------------------------------- + * Step3: Multiply the FFT output with the weights. + *----------------------------------------------------------------------*/ + arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N); + + /* The output of complex multiplication is in 3.29 format. + * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */ + arm_shift_q31 (pState, 2, pState, S->N * 2); + + /* ----------- Post-processing ---------- */ + /* DCT-IV can be obtained from DCT-II by the equation, + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * Hence, Y4(0) = Y2(0)/2 */ + /* Getting only real part from the output and Converting to DCT-IV */ + + /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ + i = (S->N - 1U) >> 2U; + + /* pbuff initialized to input buffer. */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ + in = *pS1++ >> 1U; + /* input buffer acts as inplace, so output values are stored in the input itself. */ + *pbuff++ = in; + + /* pState pointer is incremented twice as the real values are located alternatively in the array */ + pS1++; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + do + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + /* points to the next real value */ + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + in = *pS1++ - in; + *pbuff++ = in; + pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + i = (S->N - 1U) % 0x4U; + + while (i > 0U) + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + + /* points to the next real value */ + pS1++; + + /* Decrement loop counter */ + i--; + } + + + /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ + + /* Initializing the loop counter to N/4 instead of N for loop unrolling */ + i = S->N >> 2U; + + /* pbuff initialized to the pInlineBuffer(now contains the output values) */ + pbuff = pInlineBuffer; + + /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */ + do + { + /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ + in = *pbuff; + *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); + + in = *pbuff; + *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); + + in = *pbuff; + *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); + + in = *pbuff; + *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); + + /* Decrement loop counter */ + i--; + } while (i > 0U); + + +#else + + /* Initializing the loop counter to N/2 */ + i = S->Nby2; + + do + { + /* Re-ordering of even and odd elements */ + /* pState[i] = pInlineBuffer[2*i] */ + *pS1++ = *pbuff++; + /* pState[N-i-1] = pInlineBuffer[2*i+1] */ + *pS2-- = *pbuff++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + /* pbuff initialized to input buffer */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Initializing the loop counter */ + i = S->N; + + do + { + /* Writing the re-ordered output back to inplace input buffer */ + *pbuff++ = *pS1++; + + /* Decrement the loop counter */ + i--; + } while (i > 0U); + + + /* --------------------------------------------------------- + * Step2: Calculate RFFT for N-point input + * ---------------------------------------------------------- */ + /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ + arm_rfft_q31 (S->pRfft, pInlineBuffer, pState); + + /*---------------------------------------------------------------------- + * Step3: Multiply the FFT output with the weights. + *----------------------------------------------------------------------*/ + arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N); + + /* The output of complex multiplication is in 3.29 format. + * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */ + arm_shift_q31(pState, 2, pState, S->N * 2); + + /* ----------- Post-processing ---------- */ + /* DCT-IV can be obtained from DCT-II by the equation, + * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) + * Hence, Y4(0) = Y2(0)/2 */ + /* Getting only real part from the output and Converting to DCT-IV */ + + /* pbuff initialized to input buffer. */ + pbuff = pInlineBuffer; + + /* pS1 initialized to pState */ + pS1 = pState; + + /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ + in = *pS1++ >> 1U; + /* input buffer acts as inplace, so output values are stored in the input itself. */ + *pbuff++ = in; + + /* pState pointer is incremented twice as the real values are located alternatively in the array */ + pS1++; + + /* Initializing the loop counter */ + i = (S->N - 1U); + + while (i > 0U) + { + /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ + /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ + in = *pS1++ - in; + *pbuff++ = in; + + /* points to the next real value */ + pS1++; + + /* Decrement loop counter */ + i--; + } + + /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ + + /* Initializing loop counter */ + i = S->N; + + /* pbuff initialized to the pInlineBuffer (now contains the output values) */ + pbuff = pInlineBuffer; + + do + { + /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ + in = *pbuff; + *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); + + /* Decrement loop counter */ + i--; + } while (i > 0U); + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + +} + +/** + @} end of DCT4_IDCT4 group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_f16.c new file mode 100644 index 0000000..4ec6cd9 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_f16.c @@ -0,0 +1,161 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mfcc_f16.c + * Description: MFCC function for the f16 version + * + * $Date: 07 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "dsp/transform_functions_f16.h" +#include "dsp/statistics_functions_f16.h" +#include "dsp/basic_math_functions_f16.h" +#include "dsp/complex_math_functions_f16.h" +#include "dsp/fast_math_functions_f16.h" +#include "dsp/matrix_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +/** + @ingroup groupTransforms + */ + + +/** + @defgroup MFCC MFCC + + MFCC Transform + + There are separate functions for floating-point, Q15, and Q31 data types. + */ + + + +/** + @addtogroup MFCC + @{ + */ + +/** + @brief MFCC F16 + @param[in] S points to the mfcc instance structure + @param[in] pSrc points to the input samples + @param[out] pDst points to the output MFCC values + @param[inout] pTmp points to a temporary buffer of complex + + @return none + + @par Description + The number of input samples if the FFT length used + when initializing the instance data structure. + + The temporary buffer has a 2*fft length size when MFCC + is implemented with CFFT. + It has length FFT Length + 2 when implemented with RFFT + (default implementation). + + The source buffer is modified by this function. + + */ +void arm_mfcc_f16( + const arm_mfcc_instance_f16 * S, + float16_t *pSrc, + float16_t *pDst, + float16_t *pTmp + ) +{ + float16_t maxValue; + uint32_t index; + uint32_t i; + float16_t result; + const float16_t *coefs=S->filterCoefs; + arm_matrix_instance_f16 pDctMat; + + /* Normalize */ + arm_absmax_f16(pSrc,S->fftLen,&maxValue,&index); + + arm_scale_f16(pSrc,1.0f16/(_Float16)maxValue,pSrc,S->fftLen); + + /* Multiply by window */ + arm_mult_f16(pSrc,S->windowCoefs,pSrc,S->fftLen); + + /* Compute spectrum magnitude + */ +#if defined(ARM_MFCC_CFFT_BASED) + /* some HW accelerator for CMSIS-DSP used in some boards + are only providing acceleration for CFFT. + With ARM_MFCC_CFFT_BASED enabled, CFFT is used and the MFCC + will be accelerated on those boards. + + The default is to use RFFT + */ + /* Convert from real to complex */ + for(i=0; i < S->fftLen ; i++) + { + pTmp[2*i] = pSrc[i]; + pTmp[2*i+1] = 0.0f16; + } + arm_cfft_f16(&(S->cfft),pTmp,0,1); +#else + /* Default RFFT based implementation */ + arm_rfft_fast_f16(&(S->rfft),pSrc,pTmp,0); + /* Unpack real values */ + pTmp[S->fftLen]=pTmp[1]; + pTmp[S->fftLen+1]=0.0f16; + pTmp[1]=0.0f; +#endif + arm_cmplx_mag_f16(pTmp,pSrc,S->fftLen); + + /* Apply MEL filters */ + for(i=0; i<S->nbMelFilters; i++) + { + arm_dot_prod_f16(pSrc+S->filterPos[i], + coefs, + S->filterLengths[i], + &result); + + coefs += S->filterLengths[i]; + + pTmp[i] = result; + + } + + /* Compute the log */ + arm_offset_f16(pTmp,1.0e-4f16,pTmp,S->nbMelFilters); + arm_vlog_f16(pTmp,pTmp,S->nbMelFilters); + + /* Multiply with the DCT matrix */ + + pDctMat.numRows=S->nbDctOutputs; + pDctMat.numCols=S->nbMelFilters; + pDctMat.pData=(float16_t*)S->dctCoefs; + + arm_mat_vec_mult_f16(&pDctMat, pTmp, pDst); + + +} + +#endif /* defined(ARM_FLOAT16_SUPPORTED) */ +/** + @} end of MFCC group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_f32.c new file mode 100644 index 0000000..5ac9458 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_f32.c @@ -0,0 +1,159 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mfcc_f32.c + * Description: MFCC function for the f32 version + * + * $Date: 07 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#include "dsp/transform_functions.h" +#include "dsp/statistics_functions.h" +#include "dsp/basic_math_functions.h" +#include "dsp/complex_math_functions.h" +#include "dsp/fast_math_functions.h" +#include "dsp/matrix_functions.h" + +/** + @ingroup groupTransforms + */ + + +/** + @defgroup MFCC MFCC + + MFCC Transform + + There are separate functions for floating-point, Q15, and Q31 data types. + */ + + + +/** + @addtogroup MFCC + @{ + */ + +/** + @brief MFCC F32 + @param[in] S points to the mfcc instance structure + @param[in] pSrc points to the input samples + @param[out] pDst points to the output MFCC values + @param[inout] pTmp points to a temporary buffer of complex + + @return none + + @par Description + The number of input samples if the FFT length used + when initializing the instance data structure. + + The temporary buffer has a 2*fft length size when MFCC + is implemented with CFFT. + It has length FFT Length + 2 when implemented with RFFT + (default implementation). + + The source buffer is modified by this function. + + */ +void arm_mfcc_f32( + const arm_mfcc_instance_f32 * S, + float32_t *pSrc, + float32_t *pDst, + float32_t *pTmp + ) +{ + float32_t maxValue; + uint32_t index; + uint32_t i; + float32_t result; + const float32_t *coefs=S->filterCoefs; + arm_matrix_instance_f32 pDctMat; + + /* Normalize */ + arm_absmax_f32(pSrc,S->fftLen,&maxValue,&index); + + arm_scale_f32(pSrc,1.0f/maxValue,pSrc,S->fftLen); + + /* Multiply by window */ + arm_mult_f32(pSrc,S->windowCoefs,pSrc,S->fftLen); + + /* Compute spectrum magnitude + */ +#if defined(ARM_MFCC_CFFT_BASED) + /* some HW accelerator for CMSIS-DSP used in some boards + are only providing acceleration for CFFT. + With ARM_MFCC_CFFT_BASED enabled, CFFT is used and the MFCC + will be accelerated on those boards. + + The default is to use RFFT + */ + /* Convert from real to complex */ + for(i=0; i < S->fftLen ; i++) + { + pTmp[2*i] = pSrc[i]; + pTmp[2*i+1] = 0.0f; + } + arm_cfft_f32(&(S->cfft),pTmp,0,1); +#else + /* Default RFFT based implementation */ + arm_rfft_fast_f32(&(S->rfft),pSrc,pTmp,0); + /* Unpack real values */ + pTmp[S->fftLen]=pTmp[1]; + pTmp[S->fftLen+1]=0.0f; + pTmp[1]=0.0f; +#endif + arm_cmplx_mag_f32(pTmp,pSrc,S->fftLen); + + /* Apply MEL filters */ + for(i=0; i<S->nbMelFilters; i++) + { + arm_dot_prod_f32(pSrc+S->filterPos[i], + coefs, + S->filterLengths[i], + &result); + + coefs += S->filterLengths[i]; + + pTmp[i] = result; + + } + + /* Compute the log */ + arm_offset_f32(pTmp,1.0e-6f,pTmp,S->nbMelFilters); + arm_vlog_f32(pTmp,pTmp,S->nbMelFilters); + + /* Multiply with the DCT matrix */ + + pDctMat.numRows=S->nbDctOutputs; + pDctMat.numCols=S->nbMelFilters; + pDctMat.pData=(float32_t*)S->dctCoefs; + + arm_mat_vec_mult_f32(&pDctMat, pTmp, pDst); + + +} + +/** + @} end of MFCC group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_f16.c new file mode 100644 index 0000000..d90fd0d --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_f16.c @@ -0,0 +1,110 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mfcc_init_f16.c + * Description: MFCC initialization function for the f16 version + * + * $Date: 07 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + @ingroup groupTransforms + */ + + +/** + @addtogroup MFCC + @{ + */ + + +#include "dsp/transform_functions_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + + +/** + @brief Initialization of the MFCC F16 instance structure + @param[out] S points to the mfcc instance structure + @param[in] fftLen fft length + @param[in] nbMelFilters number of Mel filters + @param[in] nbDctOutputs number of Dct outputs + @param[in] dctCoefs points to an array of DCT coefficients + @param[in] filterPos points of the array of filter positions + @param[in] filterLengths points to the array of filter lengths + @param[in] filterCoefs points to the array of filter coefficients + @param[in] windowCoefs points to the array of window coefficients + + @return error status + + @par Description + The matrix of Mel filter coefficients is sparse. + Most of the coefficients are zero. + To avoid multiplying the spectrogram by those zeros, the + filter is applied only to a given position in the spectrogram + and on a given number of FFT bins (the filter length). + It is the reason for the arrays filterPos and filterLengths. + + window coefficients can describe (for instance) a Hamming window. + The array has the same size as the FFT length. + + The folder Scripts is containing a Python script which can be used + to generate the filter, dct and window arrays. + */ + +arm_status arm_mfcc_init_f16( + arm_mfcc_instance_f16 * S, + uint32_t fftLen, + uint32_t nbMelFilters, + uint32_t nbDctOutputs, + const float16_t *dctCoefs, + const uint32_t *filterPos, + const uint32_t *filterLengths, + const float16_t *filterCoefs, + const float16_t *windowCoefs + ) +{ + arm_status status; + + S->fftLen=fftLen; + S->nbMelFilters=nbMelFilters; + S->nbDctOutputs=nbDctOutputs; + S->dctCoefs=dctCoefs; + S->filterPos=filterPos; + S->filterLengths=filterLengths; + S->filterCoefs=filterCoefs; + S->windowCoefs=windowCoefs; + + #if defined(ARM_MFCC_CFFT_BASED) + status=arm_cfft_init_f16(&(S->cfft),fftLen); + #else + status=arm_rfft_fast_init_f16(&(S->rfft),fftLen); + #endif + + return(status); +} + +#endif /* defined(ARM_FLOAT16_SUPPORTED) */ +/** + @} end of MFCC group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_f32.c new file mode 100644 index 0000000..0690782 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_f32.c @@ -0,0 +1,107 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mfcc_init_f32.c + * Description: MFCC initialization function for the f32 version + * + * $Date: 07 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + @ingroup groupTransforms + */ + + +/** + @addtogroup MFCC + @{ + */ + + +#include "dsp/transform_functions.h" + + + +/** + @brief Initialization of the MFCC F32 instance structure + @param[out] S points to the mfcc instance structure + @param[in] fftLen fft length + @param[in] nbMelFilters number of Mel filters + @param[in] nbDctOutputs number of Dct outputs + @param[in] dctCoefs points to an array of DCT coefficients + @param[in] filterPos points of the array of filter positions + @param[in] filterLengths points to the array of filter lengths + @param[in] filterCoefs points to the array of filter coefficients + @param[in] windowCoefs points to the array of window coefficients + + @return error status + + @par Description + The matrix of Mel filter coefficients is sparse. + Most of the coefficients are zero. + To avoid multiplying the spectrogram by those zeros, the + filter is applied only to a given position in the spectrogram + and on a given number of FFT bins (the filter length). + It is the reason for the arrays filterPos and filterLengths. + + window coefficients can describe (for instance) a Hamming window. + The array has the same size as the FFT length. + + The folder Scripts is containing a Python script which can be used + to generate the filter, dct and window arrays. + */ + +arm_status arm_mfcc_init_f32( + arm_mfcc_instance_f32 * S, + uint32_t fftLen, + uint32_t nbMelFilters, + uint32_t nbDctOutputs, + const float32_t *dctCoefs, + const uint32_t *filterPos, + const uint32_t *filterLengths, + const float32_t *filterCoefs, + const float32_t *windowCoefs + ) +{ + arm_status status; + + S->fftLen=fftLen; + S->nbMelFilters=nbMelFilters; + S->nbDctOutputs=nbDctOutputs; + S->dctCoefs=dctCoefs; + S->filterPos=filterPos; + S->filterLengths=filterLengths; + S->filterCoefs=filterCoefs; + S->windowCoefs=windowCoefs; + + #if defined(ARM_MFCC_CFFT_BASED) + status=arm_cfft_init_f32(&(S->cfft),fftLen); + #else + status=arm_rfft_fast_init_f32(&(S->rfft),fftLen); + #endif + + return(status); +} + +/** + @} end of MFCC group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_q15.c new file mode 100644 index 0000000..60ff9f2 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_q15.c @@ -0,0 +1,107 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mfcc_init_q15.c + * Description: MFCC initialization function for the q15 version + * + * $Date: 07 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + @ingroup groupTransforms + */ + + +/** + @addtogroup MFCC + @{ + */ + + +#include "dsp/transform_functions.h" + + + +/** + @brief Initialization of the MFCC F32 instance structure + @param[out] S points to the mfcc instance structure + @param[in] fftLen fft length + @param[in] nbMelFilters number of Mel filters + @param[in] nbDctOutputs number of Dct outputs + @param[in] dctCoefs points to an array of DCT coefficients + @param[in] filterPos points of the array of filter positions + @param[in] filterLengths points to the array of filter lengths + @param[in] filterCoefs points to the array of filter coefficients + @param[in] windowCoefs points to the array of window coefficients + + @return error status + + @par Description + The matrix of Mel filter coefficients is sparse. + Most of the coefficients are zero. + To avoid multiplying the spectrogram by those zeros, the + filter is applied only to a given position in the spectrogram + and on a given number of FFT bins (the filter length). + It is the reason for the arrays filterPos and filterLengths. + + window coefficients can describe (for instance) a Hamming window. + The array has the same size as the FFT length. + + The folder Scripts is containing a Python script which can be used + to generate the filter, dct and window arrays. + */ + +arm_status arm_mfcc_init_q15( + arm_mfcc_instance_q15 * S, + uint32_t fftLen, + uint32_t nbMelFilters, + uint32_t nbDctOutputs, + const q15_t *dctCoefs, + const uint32_t *filterPos, + const uint32_t *filterLengths, + const q15_t *filterCoefs, + const q15_t *windowCoefs + ) +{ + arm_status status; + + S->fftLen=fftLen; + S->nbMelFilters=nbMelFilters; + S->nbDctOutputs=nbDctOutputs; + S->dctCoefs=dctCoefs; + S->filterPos=filterPos; + S->filterLengths=filterLengths; + S->filterCoefs=filterCoefs; + S->windowCoefs=windowCoefs; + + #if defined(ARM_MFCC_CFFT_BASED) + status=arm_cfft_init_q15(&(S->cfft),fftLen); + #else + status=arm_rfft_init_q15(&(S->rfft),fftLen,0,1); + #endif + + return(status); +} + +/** + @} end of MFCC group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_q31.c new file mode 100644 index 0000000..ce3d9dc --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_init_q31.c @@ -0,0 +1,107 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mfcc_init_q31.c + * Description: MFCC initialization function for the q31 version + * + * $Date: 07 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + @ingroup groupTransforms + */ + + +/** + @addtogroup MFCC + @{ + */ + + +#include "dsp/transform_functions.h" + + + +/** + @brief Initialization of the MFCC F32 instance structure + @param[out] S points to the mfcc instance structure + @param[in] fftLen fft length + @param[in] nbMelFilters number of Mel filters + @param[in] nbDctOutputs number of Dct outputs + @param[in] dctCoefs points to an array of DCT coefficients + @param[in] filterPos points of the array of filter positions + @param[in] filterLengths points to the array of filter lengths + @param[in] filterCoefs points to the array of filter coefficients + @param[in] windowCoefs points to the array of window coefficients + + @return error status + + @par Description + The matrix of Mel filter coefficients is sparse. + Most of the coefficients are zero. + To avoid multiplying the spectrogram by those zeros, the + filter is applied only to a given position in the spectrogram + and on a given number of FFT bins (the filter length). + It is the reason for the arrays filterPos and filterLengths. + + window coefficients can describe (for instance) a Hamming window. + The array has the same size as the FFT length. + + The folder Scripts is containing a Python script which can be used + to generate the filter, dct and window arrays. + */ + +arm_status arm_mfcc_init_q31( + arm_mfcc_instance_q31 * S, + uint32_t fftLen, + uint32_t nbMelFilters, + uint32_t nbDctOutputs, + const q31_t *dctCoefs, + const uint32_t *filterPos, + const uint32_t *filterLengths, + const q31_t *filterCoefs, + const q31_t *windowCoefs + ) +{ + arm_status status; + + S->fftLen=fftLen; + S->nbMelFilters=nbMelFilters; + S->nbDctOutputs=nbDctOutputs; + S->dctCoefs=dctCoefs; + S->filterPos=filterPos; + S->filterLengths=filterLengths; + S->filterCoefs=filterCoefs; + S->windowCoefs=windowCoefs; + + #if defined(ARM_MFCC_CFFT_BASED) + status=arm_cfft_init_q31(&(S->cfft),fftLen); + #else + status=arm_rfft_init_q31(&(S->rfft),fftLen,0,1); + #endif + + return(status); +} + +/** + @} end of MFCC group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_q15.c new file mode 100644 index 0000000..417cf9e --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_q15.c @@ -0,0 +1,208 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mfcc_q15.c + * Description: MFCC function for the q15 version + * + * $Date: 07 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#include "dsp/transform_functions.h" +#include "dsp/statistics_functions.h" +#include "dsp/basic_math_functions.h" +#include "dsp/complex_math_functions.h" +#include "dsp/fast_math_functions.h" +#include "dsp/matrix_functions.h" + +/* Constants for Q15 implementation */ +#define LOG2TOLOG_Q15 0x02C5C860 +#define MICRO_Q15 0x00000219 +#define SHIFT_MELFILTER_SATURATION_Q15 10 +/** + @ingroup groupTransforms + */ + + +/** + @defgroup MFCC MFCC + + MFCC Transform + + There are separate functions for floating-point, Q15, and Q15 data types. + */ + + + +/** + @addtogroup MFCC + @{ + */ + +/** + @brief MFCC Q15 + @param[in] S points to the mfcc instance structure + @param[in] pSrc points to the input samples in Q15 + @param[out] pDst points to the output MFCC values in q8.7 format + @param[inout] pTmp points to a temporary buffer of complex + + @return none + + @par Description + The number of input samples is the FFT length used + when initializing the instance data structure. + + The temporary buffer has a 2*fft length. + + The source buffer is modified by this function. + + The function may saturate. If the FFT length is too + big and the number of MEL filters too small then the fixed + point computations may saturate. + + */ + +arm_status arm_mfcc_q15( + const arm_mfcc_instance_q15 * S, + q15_t *pSrc, + q15_t *pDst, + q31_t *pTmp + ) +{ + q15_t m; + uint32_t index; + uint32_t fftShift=0; + q31_t logExponent; + q63_t result; + arm_matrix_instance_q15 pDctMat; + uint32_t i; + uint32_t coefsPos; + uint32_t filterLimit; + q15_t *pTmp2=(q15_t*)pTmp; + + arm_status status = ARM_MATH_SUCCESS; + + // q15 + arm_absmax_q15(pSrc,S->fftLen,&m,&index); + + if (m !=0) + { + q15_t quotient; + int16_t shift; + + status = arm_divide_q15(0x7FFF,m,"ient,&shift); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + + arm_scale_q15(pSrc,quotient,shift,pSrc,S->fftLen); + } + + + // q15 + arm_mult_q15(pSrc,S->windowCoefs, pSrc, S->fftLen); + + + /* Compute spectrum magnitude + */ + fftShift = 31 - __CLZ(S->fftLen); +#if defined(ARM_MFCC_CFFT_BASED) + /* some HW accelerator for CMSIS-DSP used in some boards + are only providing acceleration for CFFT. + With ARM_MFCC_CFFT_BASED enabled, CFFT is used and the MFCC + will be accelerated on those boards. + + The default is to use RFFT + */ + /* Convert from real to complex */ + for(i=0; i < S->fftLen ; i++) + { + pTmp2[2*i] = pSrc[i]; + pTmp2[2*i+1] = 0; + } + arm_cfft_q15(&(S->cfft),pTmp2,0,1); +#else + /* Default RFFT based implementation */ + arm_rfft_q15(&(S->rfft),pSrc,pTmp2); +#endif + filterLimit = 1 + (S->fftLen >> 1); + + + // q15 - fftShift + arm_cmplx_mag_q15(pTmp2,pSrc,filterLimit); + // q14 - fftShift + + /* Apply MEL filters */ + coefsPos = 0; + for(i=0; i<S->nbMelFilters; i++) + { + arm_dot_prod_q15(pSrc+S->filterPos[i], + &(S->filterCoefs[coefsPos]), + S->filterLengths[i], + &result); + + coefsPos += S->filterLengths[i]; + + // q34.29 - fftShift + result += MICRO_Q15; + result >>= SHIFT_MELFILTER_SATURATION_Q15; + // q34.29 - fftShift - satShift + pTmp[i] = __SSAT(result,31) ; + + } + + + // q34.29 - fftShift - satShift + /* Compute the log */ + arm_vlog_q31(pTmp,pTmp,S->nbMelFilters); + + + // q5.26 + + logExponent = fftShift + 2 + SHIFT_MELFILTER_SATURATION_Q15; + logExponent = logExponent * LOG2TOLOG_Q15; + + + // q8.26 + arm_offset_q31(pTmp,logExponent,pTmp,S->nbMelFilters); + arm_shift_q31(pTmp,-19,pTmp,S->nbMelFilters); + for(i=0; i<S->nbMelFilters; i++) + { + pSrc[i] = __SSAT((q15_t)pTmp[i],16); + } + + // q8.7 + + pDctMat.numRows=S->nbDctOutputs; + pDctMat.numCols=S->nbMelFilters; + pDctMat.pData=(q15_t*)S->dctCoefs; + + arm_mat_vec_mult_q15(&pDctMat, pSrc, pDst); + + return(status); +} + +/** + @} end of MFCC group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_q31.c new file mode 100644 index 0000000..57db4b3 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_mfcc_q31.c @@ -0,0 +1,207 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mfcc_q31.c + * Description: MFCC function for the q31 version + * + * $Date: 07 September 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#include "dsp/transform_functions.h" +#include "dsp/statistics_functions.h" +#include "dsp/basic_math_functions.h" +#include "dsp/complex_math_functions.h" +#include "dsp/fast_math_functions.h" +#include "dsp/matrix_functions.h" + +/* Constants for Q31 implementation */ +#define LOG2TOLOG_Q31 0x02C5C860 +#define MICRO_Q31 0x08637BD0 +#define SHIFT_MELFILTER_SATURATION_Q31 10 +/** + @ingroup groupTransforms + */ + + +/** + @defgroup MFCC MFCC + + MFCC Transform + + There are separate functions for floating-point, Q31, and Q31 data types. + */ + + + +/** + @addtogroup MFCC + @{ + */ + +/** + @brief MFCC Q31 + @param[in] S points to the mfcc instance structure + @param[in] pSrc points to the input samples in Q31 + @param[out] pDst points to the output MFCC values in q8.23 format + @param[inout] pTmp points to a temporary buffer of complex + + @return none + + @par Description + The number of input samples is the FFT length used + when initializing the instance data structure. + + The temporary buffer has a 2*fft length. + + The source buffer is modified by this function. + + The function may saturate. If the FFT length is too + big and the number of MEL filters too small then the fixed + point computations may saturate. + + */ + + +arm_status arm_mfcc_q31( + const arm_mfcc_instance_q31 * S, + q31_t *pSrc, + q31_t *pDst, + q31_t *pTmp + ) +{ + q31_t m; + uint32_t index; + uint32_t fftShift=0; + q31_t logExponent; + q63_t result; + arm_matrix_instance_q31 pDctMat; + uint32_t i; + uint32_t coefsPos; + uint32_t filterLimit; + q31_t *pTmp2=(q31_t*)pTmp; + + arm_status status = ARM_MATH_SUCCESS; + + // q31 + arm_absmax_q31(pSrc,S->fftLen,&m,&index); + + if (m !=0) + { + q31_t quotient; + int16_t shift; + + status = arm_divide_q31(0x7FFFFFFF,m,"ient,&shift); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + + arm_scale_q31(pSrc,quotient,shift,pSrc,S->fftLen); + } + + + // q31 + arm_mult_q31(pSrc,S->windowCoefs, pSrc, S->fftLen); + + + /* Compute spectrum magnitude + */ + fftShift = 31 - __CLZ(S->fftLen); +#if defined(ARM_MFCC_CFFT_BASED) + /* some HW accelerator for CMSIS-DSP used in some boards + are only providing acceleration for CFFT. + With ARM_MFCC_CFFT_BASED enabled, CFFT is used and the MFCC + will be accelerated on those boards. + + The default is to use RFFT + */ + /* Convert from real to complex */ + for(i=0; i < S->fftLen ; i++) + { + pTmp2[2*i] = pSrc[i]; + pTmp2[2*i+1] = 0; + } + arm_cfft_q31(&(S->cfft),pTmp2,0,1); +#else + /* Default RFFT based implementation */ + arm_rfft_q31(&(S->rfft),pSrc,pTmp2); +#endif + filterLimit = 1 + (S->fftLen >> 1); + + + // q31 - fftShift + arm_cmplx_mag_q31(pTmp2,pSrc,filterLimit); + // q30 - fftShift + + + /* Apply MEL filters */ + coefsPos = 0; + for(i=0; i<S->nbMelFilters; i++) + { + arm_dot_prod_q31(pSrc+S->filterPos[i], + &(S->filterCoefs[coefsPos]), + S->filterLengths[i], + &result); + + coefsPos += S->filterLengths[i]; + + // q16.48 - fftShift + result += MICRO_Q31; + result >>= (SHIFT_MELFILTER_SATURATION_Q31 + 18); + // q16.29 - fftShift - satShift + pTmp[i] = __SSAT(result,31) ; + + } + + + // q16.29 - fftShift - satShift + /* Compute the log */ + arm_vlog_q31(pTmp,pTmp,S->nbMelFilters); + + + // q5.26 + + logExponent = fftShift + 2 + SHIFT_MELFILTER_SATURATION_Q31; + logExponent = logExponent * LOG2TOLOG_Q31; + + + // q5.26 + arm_offset_q31(pTmp,logExponent,pTmp,S->nbMelFilters); + arm_shift_q31(pTmp,-3,pTmp,S->nbMelFilters); + + + // q8.23 + + pDctMat.numRows=S->nbDctOutputs; + pDctMat.numCols=S->nbMelFilters; + pDctMat.pData=(q31_t*)S->dctCoefs; + + arm_mat_vec_mult_q31(&pDctMat, pTmp, pDst); + + return(status); +} + +/** + @} end of MFCC group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c new file mode 100644 index 0000000..8844b73 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c @@ -0,0 +1,318 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_f32.c + * Description: RFFT & RIFFT Floating point process function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +/* ---------------------------------------------------------------------- + * Internal functions prototypes + * -------------------------------------------------------------------- */ + +extern void arm_radix4_butterfly_f32( + float32_t * pSrc, + uint16_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier); + +extern void arm_radix4_butterfly_inverse_f32( + float32_t * pSrc, + uint16_t fftLen, + const float32_t * pCoef, + uint16_t twidCoefModifier, + float32_t onebyfftLen); + +extern void arm_bitreversal_f32( + float32_t * pSrc, + uint16_t fftSize, + uint16_t bitRevFactor, + const uint16_t * pBitRevTab); + +void arm_split_rfft_f32( + float32_t * pSrc, + uint32_t fftLen, + const float32_t * pATable, + const float32_t * pBTable, + float32_t * pDst, + uint32_t modifier); + +void arm_split_rifft_f32( + float32_t * pSrc, + uint32_t fftLen, + const float32_t * pATable, + const float32_t * pBTable, + float32_t * pDst, + uint32_t modifier); + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup RealFFT + @{ + */ + +/** + @brief Processing function for the floating-point RFFT/RIFFT. + Source buffer is modified by this function. + + @deprecated Do not use this function. It has been superceded by \ref arm_rfft_fast_f32 and will be removed in the future. + @param[in] S points to an instance of the floating-point RFFT/RIFFT structure + @param[in] pSrc points to the input buffer + @param[out] pDst points to the output buffer + @return none + + @par + For the RIFFT, the source buffer must at least have length + fftLenReal + 2. + The last two elements must be equal to what would be generated + by the RFFT: + (pSrc[0] - pSrc[1]) and 0.0f + */ + +void arm_rfft_f32( + const arm_rfft_instance_f32 * S, + float32_t * pSrc, + float32_t * pDst) +{ + const arm_cfft_radix4_instance_f32 *S_CFFT = S->pCfft; + + /* Calculation of Real IFFT of input */ + if (S->ifftFlagR == 1U) + { + /* Real IFFT core process */ + arm_split_rifft_f32 (pSrc, S->fftLenBy2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier); + + + /* Complex radix-4 IFFT process */ + arm_radix4_butterfly_inverse_f32 (pDst, S_CFFT->fftLen, S_CFFT->pTwiddle, S_CFFT->twidCoefModifier, S_CFFT->onebyfftLen); + + /* Bit reversal process */ + if (S->bitReverseFlagR == 1U) + { + arm_bitreversal_f32 (pDst, S_CFFT->fftLen, S_CFFT->bitRevFactor, S_CFFT->pBitRevTable); + } + } + else + { + /* Calculation of RFFT of input */ + + /* Complex radix-4 FFT process */ + arm_radix4_butterfly_f32 (pSrc, S_CFFT->fftLen, S_CFFT->pTwiddle, S_CFFT->twidCoefModifier); + + /* Bit reversal process */ + if (S->bitReverseFlagR == 1U) + { + arm_bitreversal_f32 (pSrc, S_CFFT->fftLen, S_CFFT->bitRevFactor, S_CFFT->pBitRevTable); + } + + /* Real FFT core process */ + arm_split_rfft_f32 (pSrc, S->fftLenBy2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier); + } + +} + +/** + @} end of RealFFT group + */ + +/** + @brief Core Real FFT process + @param[in] pSrc points to input buffer + @param[in] fftLen length of FFT + @param[in] pATable points to twiddle Coef A buffer + @param[in] pBTable points to twiddle Coef B buffer + @param[out] pDst points to output buffer + @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + @return none + */ + +void arm_split_rfft_f32( + float32_t * pSrc, + uint32_t fftLen, + const float32_t * pATable, + const float32_t * pBTable, + float32_t * pDst, + uint32_t modifier) +{ + uint32_t i; /* Loop Counter */ + float32_t outR, outI; /* Temporary variables for output */ + const float32_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + float32_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */ + float32_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U]; /* temp pointers for output buffer */ + float32_t *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U]; /* temp pointers for input buffer */ + + /* Init coefficient pointers */ + pCoefA = &pATable[modifier * 2]; + pCoefB = &pBTable[modifier * 2]; + + i = fftLen - 1U; + + while (i > 0U) + { + /* + outR = ( pSrc[2 * i] * pATable[2 * i] + - pSrc[2 * i + 1] * pATable[2 * i + 1] + + pSrc[2 * n - 2 * i] * pBTable[2 * i] + + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]); + + outI = ( pIn[2 * i + 1] * pATable[2 * i] + + pIn[2 * i] * pATable[2 * i + 1] + + pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); + */ + + /* read pATable[2 * i] */ + CoefA1 = *pCoefA++; + /* pATable[2 * i + 1] */ + CoefA2 = *pCoefA; + + /* pSrc[2 * i] * pATable[2 * i] */ + outR = *pSrc1 * CoefA1; + /* pSrc[2 * i] * CoefA2 */ + outI = *pSrc1++ * CoefA2; + + /* (pSrc[2 * i + 1] + pSrc[2 * fftLen - 2 * i + 1]) * CoefA2 */ + outR -= (*pSrc1 + *pSrc2) * CoefA2; + /* pSrc[2 * i + 1] * CoefA1 */ + outI += *pSrc1++ * CoefA1; + + CoefB1 = *pCoefB; + + /* pSrc[2 * fftLen - 2 * i + 1] * CoefB1 */ + outI -= *pSrc2-- * CoefB1; + /* pSrc[2 * fftLen - 2 * i] * CoefA2 */ + outI -= *pSrc2 * CoefA2; + + /* pSrc[2 * fftLen - 2 * i] * CoefB1 */ + outR += *pSrc2-- * CoefB1; + + /* write output */ + *pDst1++ = outR; + *pDst1++ = outI; + + /* write complex conjugate output */ + *pDst2-- = -outI; + *pDst2-- = outR; + + /* update coefficient pointer */ + pCoefB = pCoefB + (modifier * 2U); + pCoefA = pCoefA + ((modifier * 2U) - 1U); + + i--; + + } + + pDst[2U * fftLen] = pSrc[0] - pSrc[1]; + pDst[(2U * fftLen) + 1U] = 0.0f; + + pDst[0] = pSrc[0] + pSrc[1]; + pDst[1] = 0.0f; + +} + + +/** + @brief Core Real IFFT process + @param[in] pSrc points to input buffer + @param[in] fftLen length of FFT + @param[in] pATable points to twiddle Coef A buffer + @param[in] pBTable points to twiddle Coef B buffer + @param[out] pDst points to output buffer + @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + @return none + */ + +void arm_split_rifft_f32( + float32_t * pSrc, + uint32_t fftLen, + const float32_t * pATable, + const float32_t * pBTable, + float32_t * pDst, + uint32_t modifier) +{ + float32_t outR, outI; /* Temporary variables for output */ + const float32_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + float32_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */ + float32_t *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U]; + + pCoefA = &pATable[0]; + pCoefB = &pBTable[0]; + + while (fftLen > 0U) + { + /* + outR = ( pIn[2 * i] * pATable[2 * i] + + pIn[2 * i + 1] * pATable[2 * i + 1] + + pIn[2 * n - 2 * i] * pBTable[2 * i] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]); + + outI = ( pIn[2 * i + 1] * pATable[2 * i] + - pIn[2 * i] * pATable[2 * i + 1] + - pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); + */ + + CoefA1 = *pCoefA++; + CoefA2 = *pCoefA; + + /* outR = (pSrc[2 * i] * CoefA1 */ + outR = *pSrc1 * CoefA1; + + /* - pSrc[2 * i] * CoefA2 */ + outI = -(*pSrc1++) * CoefA2; + + /* (pSrc[2 * i + 1] + pSrc[2 * fftLen - 2 * i + 1]) * CoefA2 */ + outR += (*pSrc1 + *pSrc2) * CoefA2; + + /* pSrc[2 * i + 1] * CoefA1 */ + outI += (*pSrc1++) * CoefA1; + + CoefB1 = *pCoefB; + + /* - pSrc[2 * fftLen - 2 * i + 1] * CoefB1 */ + outI -= *pSrc2-- * CoefB1; + + /* pSrc[2 * fftLen - 2 * i] * CoefB1 */ + outR += *pSrc2 * CoefB1; + + /* pSrc[2 * fftLen - 2 * i] * CoefA2 */ + outI += *pSrc2-- * CoefA2; + + /* write output */ + *pDst++ = outR; + *pDst++ = outI; + + /* update coefficient pointer */ + pCoefB = pCoefB + (modifier * 2); + pCoefA = pCoefA + (modifier * 2 - 1); + + /* Decrement loop count */ + fftLen--; + } + +} diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c new file mode 100644 index 0000000..9048f83 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c @@ -0,0 +1,612 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_fast_f16.c + * Description: RFFT & RIFFT Floating point process function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" +#include "arm_common_tables_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +void stage_rfft_f16( + const arm_rfft_fast_instance_f16 * S, + float16_t * p, + float16_t * pOut) +{ + int32_t k; /* Loop Counter */ + float16_t twR, twI; /* RFFT Twiddle coefficients */ + const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float16_t *pA = p; /* increasing pointer */ + float16_t *pB = p; /* decreasing pointer */ + float16_t xAR, xAI, xBR, xBI; /* temporary variables */ + float16_t t1a, t1b; /* temporary variables */ + float16_t p0, p1, p2, p3; /* temporary variables */ + + float16x8x2_t tw,xA,xB; + float16x8x2_t tmp1, tmp2, res; + + uint16x8_t vecStridesBkwd; + + vecStridesBkwd = vddupq_u16((uint16_t)14, 2); + + + int blockCnt; + + + k = (S->Sint).fftLen - 1; + + /* Pack first and last sample of the frequency domain together */ + + xBR = pB[0]; + xBI = pB[1]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++ ; + twI = *pCoeff++ ; + + // U1 = XA(1) + XB(1); % It is real + t1a = (_Float16)xBR + (_Float16)xAR ; + + // U2 = XB(1) - XA(1); % It is imaginary + t1b = (_Float16)xBI + (_Float16)xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b ); + *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b ); + + // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); + pB = p + 2*k - 14; + pA += 2; + + blockCnt = k >> 3; + while (blockCnt > 0) + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + + xA = vld2q_f16(pA); + pA += 16; + + xB = vld2q_f16(pB); + + xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd); + xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd); + + xB.val[1] = vnegq_f16(xB.val[1]); + pB -= 16; + + + tw = vld2q_f16(pCoeff); + pCoeff += 16; + + + tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]); + tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]); + + tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]); + tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]); + + res.val[0] = vmulq(tw.val[0], tmp2.val[0]); + res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]); + + res.val[1] = vmulq(tw.val[0], tmp2.val[1]); + res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]); + + res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] ); + res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] ); + + res.val[0] = vmulq_n_f16(res.val[0], 0.5f); + res.val[1] = vmulq_n_f16(res.val[1], 0.5f); + + + vst2q_f16(pOut, res); + pOut += 16; + + + blockCnt--; + } + + pB += 14; + blockCnt = k & 7; + while (blockCnt > 0) + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + xBI = pB[1]; + xBR = pB[0]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = (_Float16)xBR - (_Float16)xAR ; + t1b = (_Float16)xBI + (_Float16)xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + p0 = (_Float16)twR * (_Float16)t1a; + p1 = (_Float16)twI * (_Float16)t1a; + p2 = (_Float16)twR * (_Float16)t1b; + p3 = (_Float16)twI * (_Float16)t1b; + + *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR + *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI + + pA += 2; + pB -= 2; + blockCnt--; + } +} + +/* Prepares data for inverse cfft */ +void merge_rfft_f16( + const arm_rfft_fast_instance_f16 * S, + float16_t * p, + float16_t * pOut) +{ + int32_t k; /* Loop Counter */ + float16_t twR, twI; /* RFFT Twiddle coefficients */ + const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float16_t *pA = p; /* increasing pointer */ + float16_t *pB = p; /* decreasing pointer */ + float16_t xAR, xAI, xBR, xBI; /* temporary variables */ + float16_t t1a, t1b, r, s, t, u; /* temporary variables */ + + float16x8x2_t tw,xA,xB; + float16x8x2_t tmp1, tmp2, res; + uint16x8_t vecStridesBkwd; + + vecStridesBkwd = vddupq_u16((uint16_t)14, 2); + + int blockCnt; + + + k = (S->Sint).fftLen - 1; + + xAR = pA[0]; + xAI = pA[1]; + + pCoeff += 2 ; + + *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI ); + *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI ); + + pB = p + 2*k - 14; + pA += 2 ; + + blockCnt = k >> 3; + while (blockCnt > 0) + { + /* G is half of the frequency complex spectrum */ + //for k = 2:N + // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); + xA = vld2q_f16(pA); + pA += 16; + + xB = vld2q_f16(pB); + + xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd); + xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd); + + xB.val[1] = vnegq_f16(xB.val[1]); + pB -= 16; + + + tw = vld2q_f16(pCoeff); + tw.val[1] = vnegq_f16(tw.val[1]); + pCoeff += 16; + + + tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]); + tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]); + + tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]); + tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]); + + res.val[0] = vmulq(tw.val[0], tmp2.val[0]); + res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]); + + res.val[1] = vmulq(tw.val[0], tmp2.val[1]); + res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]); + + res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] ); + res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] ); + + res.val[0] = vmulq_n_f16(res.val[0], 0.5f); + res.val[1] = vmulq_n_f16(res.val[1], 0.5f); + + + vst2q_f16(pOut, res); + pOut += 16; + + + blockCnt--; + } + + pB += 14; + blockCnt = k & 7; + while (blockCnt > 0) + { + /* G is half of the frequency complex spectrum */ + //for k = 2:N + // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); + xBI = pB[1] ; + xBR = pB[0] ; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = (_Float16)xAR - (_Float16)xBR ; + t1b = (_Float16)xAI + (_Float16)xBI ; + + r = (_Float16)twR * (_Float16)t1a; + s = (_Float16)twI * (_Float16)t1b; + t = (_Float16)twI * (_Float16)t1a; + u = (_Float16)twR * (_Float16)t1b; + + // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI); + // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI); + *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR + *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI + + pA += 2; + pB -= 2; + blockCnt--; + } + +} +#else +void stage_rfft_f16( + const arm_rfft_fast_instance_f16 * S, + float16_t * p, + float16_t * pOut) +{ + int32_t k; /* Loop Counter */ + float16_t twR, twI; /* RFFT Twiddle coefficients */ + const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float16_t *pA = p; /* increasing pointer */ + float16_t *pB = p; /* decreasing pointer */ + float16_t xAR, xAI, xBR, xBI; /* temporary variables */ + float16_t t1a, t1b; /* temporary variables */ + float16_t p0, p1, p2, p3; /* temporary variables */ + + + k = (S->Sint).fftLen - 1; + + /* Pack first and last sample of the frequency domain together */ + + xBR = pB[0]; + xBI = pB[1]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++ ; + twI = *pCoeff++ ; + + + // U1 = XA(1) + XB(1); % It is real + t1a = (_Float16)xBR + (_Float16)xAR ; + + // U2 = XB(1) - XA(1); % It is imaginary + t1b = (_Float16)xBI + (_Float16)xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b ); + *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b ); + + // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); + pB = p + 2*k; + pA += 2; + + do + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + xBI = pB[1]; + xBR = pB[0]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = (_Float16)xBR - (_Float16)xAR ; + t1b = (_Float16)xBI + (_Float16)xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + p0 = (_Float16)twR * (_Float16)t1a; + p1 = (_Float16)twI * (_Float16)t1a; + p2 = (_Float16)twR * (_Float16)t1b; + p3 = (_Float16)twI * (_Float16)t1b; + + *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR + *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI + + + pA += 2; + pB -= 2; + k--; + } while (k > 0); +} + +/* Prepares data for inverse cfft */ +void merge_rfft_f16( + const arm_rfft_fast_instance_f16 * S, + float16_t * p, + float16_t * pOut) +{ + int32_t k; /* Loop Counter */ + float16_t twR, twI; /* RFFT Twiddle coefficients */ + const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float16_t *pA = p; /* increasing pointer */ + float16_t *pB = p; /* decreasing pointer */ + float16_t xAR, xAI, xBR, xBI; /* temporary variables */ + float16_t t1a, t1b, r, s, t, u; /* temporary variables */ + + k = (S->Sint).fftLen - 1; + + xAR = pA[0]; + xAI = pA[1]; + + pCoeff += 2 ; + + *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI ); + *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI ); + + pB = p + 2*k ; + pA += 2 ; + + while (k > 0) + { + /* G is half of the frequency complex spectrum */ + //for k = 2:N + // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); + xBI = pB[1] ; + xBR = pB[0] ; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = (_Float16)xAR - (_Float16)xBR ; + t1b = (_Float16)xAI + (_Float16)xBI ; + + r = (_Float16)twR * (_Float16)t1a; + s = (_Float16)twI * (_Float16)t1b; + t = (_Float16)twI * (_Float16)t1a; + u = (_Float16)twR * (_Float16)t1b; + + // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI); + // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI); + *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR + *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI + + pA += 2; + pB -= 2; + k--; + } + +} + +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @ingroup groupTransforms +*/ + +/** + @defgroup RealFFT Real FFT Functions + + @par + The CMSIS DSP library includes specialized algorithms for computing the + FFT of real data sequences. The FFT is defined over complex data but + in many applications the input is real. Real FFT algorithms take advantage + of the symmetry properties of the FFT and have a speed advantage over complex + algorithms of the same length. + @par + The Fast RFFT algorithm relays on the mixed radix CFFT that save processor usage. + @par + The real length N forward FFT of a sequence is computed using the steps shown below. + @par + \image html RFFT.gif "Real Fast Fourier Transform" + @par + The real sequence is initially treated as if it were complex to perform a CFFT. + Later, a processing stage reshapes the data to obtain half of the frequency spectrum + in complex format. Except the first complex number that contains the two real numbers + X[0] and X[N/2] all the data is complex. In other words, the first complex sample + contains two real values packed. + @par + The input for the inverse RFFT should keep the same format as the output of the + forward RFFT. A first processing stage pre-process the data to later perform an + inverse CFFT. + @par + \image html RIFFT.gif "Real Inverse Fast Fourier Transform" + @par + The algorithms for floating-point, Q15, and Q31 data are slightly different + and we describe each algorithm in turn. + @par Floating-point + The main functions are \ref arm_rfft_fast_f16() and \ref arm_rfft_fast_init_f16(). + + @par + The FFT of a real N-point sequence has even symmetry in the frequency domain. + The second half of the data equals the conjugate of the first half flipped in frequency. + Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers. + These are packed into the output array in alternating real and imaginary components: + @par + X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ... + real[(N/2)-1], imag[(N/2)-1 } + @par + It happens that the first complex number (real[0], imag[0]) is actually + all real. real[0] represents the DC offset, and imag[0] should be 0. + (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is + the first harmonic and so on. + @par + The real FFT functions pack the frequency domain data in this fashion. + The forward transform outputs the data in this form and the inverse + transform expects input data in this form. The function always performs + the needed bitreversal so that the input and output data is always in + normal order. The functions support lengths of [32, 64, 128, ..., 4096] + samples. + @par Q15 and Q31 + The real algorithms are defined in a similar manner and utilize N/2 complex + transforms behind the scenes. + @par + The complex transforms used internally include scaling to prevent fixed-point + overflows. The overall scaling equals 1/(fftLen/2). + Due to the use of complex transform internally, the source buffer is + modified by the rfft. + @par + A separate instance structure must be defined for each transform used but + twiddle factor and bit reversal tables can be reused. + @par + There is also an associated initialization function for each data type. + The initialization function performs the following operations: + - Sets the values of the internal structure fields. + - Initializes twiddle factor table and bit reversal table pointers. + - Initializes the internal complex FFT data structure. + @par + Use of the initialization function is optional **except for MVE versions where it is mandatory**. + If you don't use the initialization functions, then the structures should be initialized with code + similar to the one below: + <pre> + arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft}; + arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft}; + </pre> + where <code>fftLenReal</code> is the length of the real transform; + <code>fftLenBy2</code> length of the internal complex transform (fftLenReal/2). + <code>ifftFlagR</code> Selects forward (=0) or inverse (=1) transform. + <code>bitReverseFlagR</code> Selects bit reversed output (=0) or normal order + output (=1). + <code>twidCoefRModifier</code> stride modifier for the twiddle factor table. + The value is based on the FFT length; + <code>pTwiddleAReal</code>points to the A array of twiddle coefficients; + <code>pTwiddleBReal</code>points to the B array of twiddle coefficients; + <code>pCfft</code> points to the CFFT Instance structure. The CFFT structure + must also be initialized. +@par + Note that with MVE versions you can't initialize instance structures directly and **must + use the initialization function**. + */ + +/** + @addtogroup RealFFT + @{ +*/ + +/** + @brief Processing function for the floating-point real FFT. + @param[in] S points to an arm_rfft_fast_instance_f16 structure + @param[in] p points to input buffer (Source buffer is modified by this function.) + @param[in] pOut points to output buffer + @param[in] ifftFlag + - value = 0: RFFT + - value = 1: RIFFT + @return none +*/ + +void arm_rfft_fast_f16( + const arm_rfft_fast_instance_f16 * S, + float16_t * p, + float16_t * pOut, + uint8_t ifftFlag) +{ + const arm_cfft_instance_f16 * Sint = &(S->Sint); + + + /* Calculation of Real FFT */ + if (ifftFlag) + { + /* Real FFT compression */ + merge_rfft_f16(S, p, pOut); + /* Complex radix-4 IFFT process */ + arm_cfft_f16( Sint, pOut, ifftFlag, 1); + } + else + { + + /* Calculation of RFFT of input */ + arm_cfft_f16( Sint, p, ifftFlag, 1); + + /* Real FFT extraction */ + stage_rfft_f16(S, p, pOut); + } +} + +/** +* @} end of RealFFT group +*/ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c new file mode 100644 index 0000000..6712504 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c @@ -0,0 +1,603 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_fast_f32.c + * Description: RFFT & RIFFT Floating point process function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +void stage_rfft_f32( + const arm_rfft_fast_instance_f32 * S, + float32_t * p, + float32_t * pOut) +{ + int32_t k; /* Loop Counter */ + float32_t twR, twI; /* RFFT Twiddle coefficients */ + const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float32_t *pA = p; /* increasing pointer */ + float32_t *pB = p; /* decreasing pointer */ + float32_t xAR, xAI, xBR, xBI; /* temporary variables */ + float32_t t1a, t1b; /* temporary variables */ + float32_t p0, p1, p2, p3; /* temporary variables */ + + float32x4x2_t tw,xA,xB; + float32x4x2_t tmp1, tmp2, res; + + uint32x4_t vecStridesFwd, vecStridesBkwd; + + vecStridesFwd = vidupq_u32((uint32_t)0, 2); + vecStridesBkwd = -vecStridesFwd; + + int blockCnt; + + + k = (S->Sint).fftLen - 1; + + /* Pack first and last sample of the frequency domain together */ + + xBR = pB[0]; + xBI = pB[1]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++ ; + twI = *pCoeff++ ; + + // U1 = XA(1) + XB(1); % It is real + t1a = xBR + xAR ; + + // U2 = XB(1) - XA(1); % It is imaginary + t1b = xBI + xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + *pOut++ = 0.5f * ( t1a + t1b ); + *pOut++ = 0.5f * ( t1a - t1b ); + + // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); + pB = p + 2*k; + pA += 2; + + blockCnt = k >> 2; + while (blockCnt > 0) + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + + xA = vld2q_f32(pA); + pA += 8; + + xB = vld2q_f32(pB); + + xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd); + xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd); + + xB.val[1] = vnegq_f32(xB.val[1]); + pB -= 8; + + + tw = vld2q_f32(pCoeff); + pCoeff += 8; + + + tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]); + tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]); + + tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]); + tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]); + + res.val[0] = vmulq(tw.val[0], tmp2.val[0]); + res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]); + + res.val[1] = vmulq(tw.val[0], tmp2.val[1]); + res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]); + + res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] ); + res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] ); + + res.val[0] = vmulq_n_f32(res.val[0], 0.5f); + res.val[1] = vmulq_n_f32(res.val[1], 0.5f); + + + vst2q_f32(pOut, res); + pOut += 8; + + + blockCnt--; + } + + blockCnt = k & 3; + while (blockCnt > 0) + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + xBI = pB[1]; + xBR = pB[0]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = xBR - xAR ; + t1b = xBI + xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + p0 = twR * t1a; + p1 = twI * t1a; + p2 = twR * t1b; + p3 = twI * t1b; + + *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR + *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI + + pA += 2; + pB -= 2; + blockCnt--; + } +} + +/* Prepares data for inverse cfft */ +void merge_rfft_f32( + const arm_rfft_fast_instance_f32 * S, + float32_t * p, + float32_t * pOut) +{ + int32_t k; /* Loop Counter */ + float32_t twR, twI; /* RFFT Twiddle coefficients */ + const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float32_t *pA = p; /* increasing pointer */ + float32_t *pB = p; /* decreasing pointer */ + float32_t xAR, xAI, xBR, xBI; /* temporary variables */ + float32_t t1a, t1b, r, s, t, u; /* temporary variables */ + + float32x4x2_t tw,xA,xB; + float32x4x2_t tmp1, tmp2, res; + uint32x4_t vecStridesFwd, vecStridesBkwd; + + vecStridesFwd = vidupq_u32((uint32_t)0, 2); + vecStridesBkwd = -vecStridesFwd; + + int blockCnt; + + + k = (S->Sint).fftLen - 1; + + xAR = pA[0]; + xAI = pA[1]; + + pCoeff += 2 ; + + *pOut++ = 0.5f * ( xAR + xAI ); + *pOut++ = 0.5f * ( xAR - xAI ); + + pB = p + 2*k ; + pA += 2 ; + + blockCnt = k >> 2; + while (blockCnt > 0) + { + /* G is half of the frequency complex spectrum */ + //for k = 2:N + // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); + xA = vld2q_f32(pA); + pA += 8; + + xB = vld2q_f32(pB); + + xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd); + xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd); + + xB.val[1] = vnegq_f32(xB.val[1]); + pB -= 8; + + + tw = vld2q_f32(pCoeff); + tw.val[1] = vnegq_f32(tw.val[1]); + pCoeff += 8; + + + tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]); + tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]); + + tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]); + tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]); + + res.val[0] = vmulq(tw.val[0], tmp2.val[0]); + res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]); + + res.val[1] = vmulq(tw.val[0], tmp2.val[1]); + res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]); + + res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] ); + res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] ); + + res.val[0] = vmulq_n_f32(res.val[0], 0.5f); + res.val[1] = vmulq_n_f32(res.val[1], 0.5f); + + + vst2q_f32(pOut, res); + pOut += 8; + + + blockCnt--; + } + + blockCnt = k & 3; + while (blockCnt > 0) + { + /* G is half of the frequency complex spectrum */ + //for k = 2:N + // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); + xBI = pB[1] ; + xBR = pB[0] ; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = xAR - xBR ; + t1b = xAI + xBI ; + + r = twR * t1a; + s = twI * t1b; + t = twI * t1a; + u = twR * t1b; + + // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI); + // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI); + *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR + *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI + + pA += 2; + pB -= 2; + blockCnt--; + } + +} +#else +void stage_rfft_f32( + const arm_rfft_fast_instance_f32 * S, + float32_t * p, + float32_t * pOut) +{ + int32_t k; /* Loop Counter */ + float32_t twR, twI; /* RFFT Twiddle coefficients */ + const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float32_t *pA = p; /* increasing pointer */ + float32_t *pB = p; /* decreasing pointer */ + float32_t xAR, xAI, xBR, xBI; /* temporary variables */ + float32_t t1a, t1b; /* temporary variables */ + float32_t p0, p1, p2, p3; /* temporary variables */ + + + k = (S->Sint).fftLen - 1; + + /* Pack first and last sample of the frequency domain together */ + + xBR = pB[0]; + xBI = pB[1]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++ ; + twI = *pCoeff++ ; + + + // U1 = XA(1) + XB(1); % It is real + t1a = xBR + xAR ; + + // U2 = XB(1) - XA(1); % It is imaginary + t1b = xBI + xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + *pOut++ = 0.5f * ( t1a + t1b ); + *pOut++ = 0.5f * ( t1a - t1b ); + + // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); + pB = p + 2*k; + pA += 2; + + do + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + xBI = pB[1]; + xBR = pB[0]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = xBR - xAR ; + t1b = xBI + xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + p0 = twR * t1a; + p1 = twI * t1a; + p2 = twR * t1b; + p3 = twI * t1b; + + *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR + *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI + + + pA += 2; + pB -= 2; + k--; + } while (k > 0); +} + +/* Prepares data for inverse cfft */ +void merge_rfft_f32( + const arm_rfft_fast_instance_f32 * S, + float32_t * p, + float32_t * pOut) +{ + int32_t k; /* Loop Counter */ + float32_t twR, twI; /* RFFT Twiddle coefficients */ + const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float32_t *pA = p; /* increasing pointer */ + float32_t *pB = p; /* decreasing pointer */ + float32_t xAR, xAI, xBR, xBI; /* temporary variables */ + float32_t t1a, t1b, r, s, t, u; /* temporary variables */ + + k = (S->Sint).fftLen - 1; + + xAR = pA[0]; + xAI = pA[1]; + + pCoeff += 2 ; + + *pOut++ = 0.5f * ( xAR + xAI ); + *pOut++ = 0.5f * ( xAR - xAI ); + + pB = p + 2*k ; + pA += 2 ; + + while (k > 0) + { + /* G is half of the frequency complex spectrum */ + //for k = 2:N + // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); + xBI = pB[1] ; + xBR = pB[0] ; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = xAR - xBR ; + t1b = xAI + xBI ; + + r = twR * t1a; + s = twI * t1b; + t = twI * t1a; + u = twR * t1b; + + // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI); + // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI); + *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR + *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI + + pA += 2; + pB -= 2; + k--; + } + +} + +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + +/** + @ingroup groupTransforms +*/ + +/** + @defgroup RealFFT Real FFT Functions + + @par + The CMSIS DSP library includes specialized algorithms for computing the + FFT of real data sequences. The FFT is defined over complex data but + in many applications the input is real. Real FFT algorithms take advantage + of the symmetry properties of the FFT and have a speed advantage over complex + algorithms of the same length. + @par + The Fast RFFT algorithm relays on the mixed radix CFFT that save processor usage. + @par + The real length N forward FFT of a sequence is computed using the steps shown below. + @par + \image html RFFT.gif "Real Fast Fourier Transform" + @par + The real sequence is initially treated as if it were complex to perform a CFFT. + Later, a processing stage reshapes the data to obtain half of the frequency spectrum + in complex format. Except the first complex number that contains the two real numbers + X[0] and X[N/2] all the data is complex. In other words, the first complex sample + contains two real values packed. + @par + The input for the inverse RFFT should keep the same format as the output of the + forward RFFT. A first processing stage pre-process the data to later perform an + inverse CFFT. + @par + \image html RIFFT.gif "Real Inverse Fast Fourier Transform" + @par + The algorithms for floating-point, Q15, and Q31 data are slightly different + and we describe each algorithm in turn. + @par Floating-point + The main functions are \ref arm_rfft_fast_f32() and \ref arm_rfft_fast_init_f32(). + The older functions \ref arm_rfft_f32() and \ref arm_rfft_init_f32() have been deprecated + but are still documented. + @par + The FFT of a real N-point sequence has even symmetry in the frequency domain. + The second half of the data equals the conjugate of the first half flipped in frequency. + Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers. + These are packed into the output array in alternating real and imaginary components: + @par + X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ... + real[(N/2)-1], imag[(N/2)-1 } + @par + It happens that the first complex number (real[0], imag[0]) is actually + all real. real[0] represents the DC offset, and imag[0] should be 0. + (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is + the first harmonic and so on. + @par + The real FFT functions pack the frequency domain data in this fashion. + The forward transform outputs the data in this form and the inverse + transform expects input data in this form. The function always performs + the needed bitreversal so that the input and output data is always in + normal order. The functions support lengths of [32, 64, 128, ..., 4096] + samples. + @par Q15 and Q31 + The real algorithms are defined in a similar manner and utilize N/2 complex + transforms behind the scenes. + @par + The complex transforms used internally include scaling to prevent fixed-point + overflows. The overall scaling equals 1/(fftLen/2). + Due to the use of complex transform internally, the source buffer is + modified by the rfft. + @par + A separate instance structure must be defined for each transform used but + twiddle factor and bit reversal tables can be reused. + @par + There is also an associated initialization function for each data type. + The initialization function performs the following operations: + - Sets the values of the internal structure fields. + - Initializes twiddle factor table and bit reversal table pointers. + - Initializes the internal complex FFT data structure. + @par + Use of the initialization function is optional **except for MVE versions where it is mandatory**. + If you don't use the initialization functions, then the structures should be initialized with code + similar to the one below: + <pre> + arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft}; + arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft}; + </pre> + where <code>fftLenReal</code> is the length of the real transform; + <code>fftLenBy2</code> length of the internal complex transform (fftLenReal/2). + <code>ifftFlagR</code> Selects forward (=0) or inverse (=1) transform. + <code>bitReverseFlagR</code> Selects bit reversed output (=0) or normal order + output (=1). + <code>twidCoefRModifier</code> stride modifier for the twiddle factor table. + The value is based on the FFT length; + <code>pTwiddleAReal</code>points to the A array of twiddle coefficients; + <code>pTwiddleBReal</code>points to the B array of twiddle coefficients; + <code>pCfft</code> points to the CFFT Instance structure. The CFFT structure + must also be initialized. +@par + Note that with MVE versions you can't initialize instance structures directly and **must + use the initialization function**. + */ + +/** + @addtogroup RealFFT + @{ +*/ + +/** + @brief Processing function for the floating-point real FFT. + @param[in] S points to an arm_rfft_fast_instance_f32 structure + @param[in] p points to input buffer (Source buffer is modified by this function.) + @param[in] pOut points to output buffer + @param[in] ifftFlag + - value = 0: RFFT + - value = 1: RIFFT + @return none +*/ + +void arm_rfft_fast_f32( + const arm_rfft_fast_instance_f32 * S, + float32_t * p, + float32_t * pOut, + uint8_t ifftFlag) +{ + const arm_cfft_instance_f32 * Sint = &(S->Sint); + + /* Calculation of Real FFT */ + if (ifftFlag) + { + /* Real FFT compression */ + merge_rfft_f32(S, p, pOut); + /* Complex radix-4 IFFT process */ + arm_cfft_f32( Sint, pOut, ifftFlag, 1); + } + else + { + /* Calculation of RFFT of input */ + arm_cfft_f32( Sint, p, ifftFlag, 1); + + /* Real FFT extraction */ + stage_rfft_f32(S, p, pOut); + } +} + +/** +* @} end of RealFFT group +*/ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c new file mode 100644 index 0000000..a1e4ed0 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c @@ -0,0 +1,228 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_fast_f64.c + * Description: RFFT & RIFFT Double precision Floating point process function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +void stage_rfft_f64( + const arm_rfft_fast_instance_f64 * S, + float64_t * p, + float64_t * pOut) +{ + uint32_t k; /* Loop Counter */ + float64_t twR, twI; /* RFFT Twiddle coefficients */ + const float64_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float64_t *pA = p; /* increasing pointer */ + float64_t *pB = p; /* decreasing pointer */ + float64_t xAR, xAI, xBR, xBI; /* temporary variables */ + float64_t t1a, t1b; /* temporary variables */ + float64_t p0, p1, p2, p3; /* temporary variables */ + + + k = (S->Sint).fftLen - 1; + + /* Pack first and last sample of the frequency domain together */ + + xBR = pB[0]; + xBI = pB[1]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++ ; + twI = *pCoeff++ ; + + // U1 = XA(1) + XB(1); % It is real + t1a = xBR + xAR ; + + // U2 = XB(1) - XA(1); % It is imaginary + t1b = xBI + xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + *pOut++ = 0.5 * ( t1a + t1b ); + *pOut++ = 0.5 * ( t1a - t1b ); + + // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); + pB = p + 2*k; + pA += 2; + + do + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + xBI = pB[1]; + xBR = pB[0]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = xBR - xAR ; + t1b = xBI + xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + p0 = twR * t1a; + p1 = twI * t1a; + p2 = twR * t1b; + p3 = twI * t1b; + + *pOut++ = 0.5 * (xAR + xBR + p0 + p3 ); //xAR + *pOut++ = 0.5 * (xAI - xBI + p1 - p2 ); //xAI + + pA += 2; + pB -= 2; + k--; + } while (k > 0U); +} + +/* Prepares data for inverse cfft */ +void merge_rfft_f64( + const arm_rfft_fast_instance_f64 * S, + float64_t * p, + float64_t * pOut) +{ + uint32_t k; /* Loop Counter */ + float64_t twR, twI; /* RFFT Twiddle coefficients */ + const float64_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float64_t *pA = p; /* increasing pointer */ + float64_t *pB = p; /* decreasing pointer */ + float64_t xAR, xAI, xBR, xBI; /* temporary variables */ + float64_t t1a, t1b, r, s, t, u; /* temporary variables */ + + k = (S->Sint).fftLen - 1; + + xAR = pA[0]; + xAI = pA[1]; + + pCoeff += 2 ; + + *pOut++ = 0.5 * ( xAR + xAI ); + *pOut++ = 0.5 * ( xAR - xAI ); + + pB = p + 2*k ; + pA += 2 ; + + while (k > 0U) + { + /* G is half of the frequency complex spectrum */ + //for k = 2:N + // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); + xBI = pB[1] ; + xBR = pB[0] ; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = xAR - xBR ; + t1b = xAI + xBI ; + + r = twR * t1a; + s = twI * t1b; + t = twI * t1a; + u = twR * t1b; + + // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI); + // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI); + *pOut++ = 0.5 * (xAR + xBR - r - s ); //xAR + *pOut++ = 0.5 * (xAI - xBI + t - u ); //xAI + + pA += 2; + pB -= 2; + k--; + } + +} + +/** + @ingroup groupTransforms +*/ + + +/** + @addtogroup RealFFT + @{ +*/ + +/** + @brief Processing function for the Double Precision floating-point real FFT. + @param[in] S points to an arm_rfft_fast_instance_f64 structure + @param[in] p points to input buffer (Source buffer is modified by this function.) + @param[in] pOut points to output buffer + @param[in] ifftFlag + - value = 0: RFFT + - value = 1: RIFFT + @return none +*/ + +void arm_rfft_fast_f64( + arm_rfft_fast_instance_f64 * S, + float64_t * p, + float64_t * pOut, + uint8_t ifftFlag) +{ + arm_cfft_instance_f64 * Sint = &(S->Sint); + Sint->fftLen = S->fftLenRFFT / 2; + + /* Calculation of Real FFT */ + if (ifftFlag) + { + /* Real FFT compression */ + merge_rfft_f64(S, p, pOut); + + /* Complex radix-4 IFFT process */ + arm_cfft_f64( Sint, pOut, ifftFlag, 1); + } + else + { + /* Calculation of RFFT of input */ + arm_cfft_f64( Sint, p, ifftFlag, 1); + + /* Real FFT extraction */ + stage_rfft_f64(S, p, pOut); + } +} + +/** +* @} end of RealFFT group +*/ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c new file mode 100644 index 0000000..158180b --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c @@ -0,0 +1,357 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_fast_init_f16.c + * Description: Split Radix Decimation in Frequency CFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions_f16.h" +#include "arm_common_tables_f16.h" +#include "arm_const_structs_f16.h" + +#if defined(ARM_FLOAT16_SUPPORTED) + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup RealFFT + @{ + */ + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32)) + +/** + @private + @brief Initialization function for the 32pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_32_fast_init_f16( arm_rfft_fast_instance_f16 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f16(&(S->Sint),16); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + + S->fftLenRFFT = 32U; + S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_32; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64)) + +/** + @private + @brief Initialization function for the 64pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_64_fast_init_f16( arm_rfft_fast_instance_f16 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f16(&(S->Sint),32); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 64U; + + S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_64; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128)) + +/** + @private + @brief Initialization function for the 128pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_128_fast_init_f16( arm_rfft_fast_instance_f16 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f16(&(S->Sint),64); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 128; + + S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_128; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256)) + +/** + @private + @brief Initialization function for the 256pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected +*/ + +static arm_status arm_rfft_256_fast_init_f16( arm_rfft_fast_instance_f16 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f16(&(S->Sint),128); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 256U; + + S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_256; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512)) + +/** + @private + @brief Initialization function for the 512pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_512_fast_init_f16( arm_rfft_fast_instance_f16 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f16(&(S->Sint),256); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 512U; + + S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_512; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024)) +/** + @private + @brief Initialization function for the 1024pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_1024_fast_init_f16( arm_rfft_fast_instance_f16 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f16(&(S->Sint),512); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 1024U; + + S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_1024; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048)) +/** + @private + @brief Initialization function for the 2048pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ +static arm_status arm_rfft_2048_fast_init_f16( arm_rfft_fast_instance_f16 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f16(&(S->Sint),1024); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 2048U; + + S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_2048; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096)) +/** + @private +* @brief Initialization function for the 4096pt floating-point real FFT. +* @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_4096_fast_init_f16( arm_rfft_fast_instance_f16 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f16(&(S->Sint),2048); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 4096U; + + S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_4096; + + return ARM_MATH_SUCCESS; +} +#endif + +/** + @brief Initialization function for the floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f16 structure + @param[in] fftLen length of the Real Sequence + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Description + The parameter <code>fftLen</code> specifies the length of RFFT/CIFFT process. + Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. + */ + +arm_status arm_rfft_fast_init_f16( + arm_rfft_fast_instance_f16 * S, + uint16_t fftLen) +{ + typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f16 *); + fft_init_ptr fptr = 0x0; + + switch (fftLen) + { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096)) + case 4096U: + fptr = arm_rfft_4096_fast_init_f16; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048)) + case 2048U: + fptr = arm_rfft_2048_fast_init_f16; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024)) + case 1024U: + fptr = arm_rfft_1024_fast_init_f16; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512)) + case 512U: + fptr = arm_rfft_512_fast_init_f16; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256)) + case 256U: + fptr = arm_rfft_256_fast_init_f16; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128)) + case 128U: + fptr = arm_rfft_128_fast_init_f16; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64)) + case 64U: + fptr = arm_rfft_64_fast_init_f16; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32)) + case 32U: + fptr = arm_rfft_32_fast_init_f16; + break; +#endif + default: + break; + } + + if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR; + return fptr( S ); + +} + +/** + @} end of RealFFT group + */ + +#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c new file mode 100644 index 0000000..e8273b4 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c @@ -0,0 +1,352 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_fast_init_f32.c + * Description: Split Radix Decimation in Frequency CFFT Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup RealFFT + @{ + */ + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32)) + +/** + @private + @brief Initialization function for the 32pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_32_fast_init_f32( arm_rfft_fast_instance_f32 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f32(&(S->Sint),16); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + + S->fftLenRFFT = 32U; + S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_32; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64)) + +/** + @private + @brief Initialization function for the 64pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_64_fast_init_f32( arm_rfft_fast_instance_f32 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f32(&(S->Sint),32); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 64U; + + S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_64; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128)) + +/** + @private + @brief Initialization function for the 128pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_128_fast_init_f32( arm_rfft_fast_instance_f32 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f32(&(S->Sint),64); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 128; + + S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_128; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256)) + +/** + @private + @brief Initialization function for the 256pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected +*/ + +static arm_status arm_rfft_256_fast_init_f32( arm_rfft_fast_instance_f32 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f32(&(S->Sint),128); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 256U; + + S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_256; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512)) + +/** + @private + @brief Initialization function for the 512pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_512_fast_init_f32( arm_rfft_fast_instance_f32 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f32(&(S->Sint),256); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 512U; + + S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_512; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024)) +/** + @private + @brief Initialization function for the 1024pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_1024_fast_init_f32( arm_rfft_fast_instance_f32 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f32(&(S->Sint),512); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 1024U; + + S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_1024; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048)) +/** + @private + @brief Initialization function for the 2048pt floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ +static arm_status arm_rfft_2048_fast_init_f32( arm_rfft_fast_instance_f32 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f32(&(S->Sint),1024); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 2048U; + + S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_2048; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096)) +/** + @private +* @brief Initialization function for the 4096pt floating-point real FFT. +* @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_4096_fast_init_f32( arm_rfft_fast_instance_f32 * S ) { + + arm_status status; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + status=arm_cfft_init_f32(&(S->Sint),2048); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + S->fftLenRFFT = 4096U; + + S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_4096; + + return ARM_MATH_SUCCESS; +} +#endif + +/** + @brief Initialization function for the floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f32 structure + @param[in] fftLen length of the Real Sequence + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Description + The parameter <code>fftLen</code> specifies the length of RFFT/CIFFT process. + Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. + */ + +arm_status arm_rfft_fast_init_f32( + arm_rfft_fast_instance_f32 * S, + uint16_t fftLen) +{ + typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f32 *); + fft_init_ptr fptr = 0x0; + + switch (fftLen) + { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096)) + case 4096U: + fptr = arm_rfft_4096_fast_init_f32; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048)) + case 2048U: + fptr = arm_rfft_2048_fast_init_f32; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024)) + case 1024U: + fptr = arm_rfft_1024_fast_init_f32; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512)) + case 512U: + fptr = arm_rfft_512_fast_init_f32; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256)) + case 256U: + fptr = arm_rfft_256_fast_init_f32; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128)) + case 128U: + fptr = arm_rfft_128_fast_init_f32; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64)) + case 64U: + fptr = arm_rfft_64_fast_init_f32; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32)) + case 32U: + fptr = arm_rfft_32_fast_init_f32; + break; +#endif + default: + break; + } + + if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR; + return fptr( S ); + +} + +/** + @} end of RealFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c new file mode 100644 index 0000000..c6d3e69 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c @@ -0,0 +1,344 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_fast_init_f64.c + * Description: Split Radix Decimation in Frequency CFFT Double Precision Floating point processing function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + +/** + @ingroup groupTransforms + */ + +/** + @addtogroup RealFFT + @{ + */ + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16) && defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32)) + +/** + @brief Initialization function for the 32pt double precision floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_32_fast_init_f64( arm_rfft_fast_instance_f64 * S ) { + + arm_cfft_instance_f64 * Sint; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + Sint = &(S->Sint); + Sint->fftLen = 16U; + S->fftLenRFFT = 32U; + + Sint->bitRevLength = ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH; + Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_16; + Sint->pTwiddle = (float64_t *) twiddleCoefF64_16; + S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_32; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32) && defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64)) + +/** + @brief Initialization function for the 64pt Double Precision floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_64_fast_init_f64( arm_rfft_fast_instance_f64 * S ) { + + arm_cfft_instance_f64 * Sint; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + Sint = &(S->Sint); + Sint->fftLen = 32U; + S->fftLenRFFT = 64U; + + Sint->bitRevLength = ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH; + Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_32; + Sint->pTwiddle = (float64_t *) twiddleCoefF64_32; + S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_64; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64) && defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128)) + +/** + @brief Initialization function for the 128pt Double Precision floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_128_fast_init_f64( arm_rfft_fast_instance_f64 * S ) { + + arm_cfft_instance_f64 * Sint; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + Sint = &(S->Sint); + Sint->fftLen = 64U; + S->fftLenRFFT = 128U; + + Sint->bitRevLength = ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH; + Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_64; + Sint->pTwiddle = (float64_t *) twiddleCoefF64_64; + S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_128; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128) && defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256)) + +/** + @brief Initialization function for the 256pt Double Precision floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected +*/ + +static arm_status arm_rfft_256_fast_init_f64( arm_rfft_fast_instance_f64 * S ) { + + arm_cfft_instance_f64 * Sint; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + Sint = &(S->Sint); + Sint->fftLen = 128U; + S->fftLenRFFT = 256U; + + Sint->bitRevLength = ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH; + Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_128; + Sint->pTwiddle = (float64_t *) twiddleCoefF64_128; + S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_256; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256) && defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512)) + +/** + @brief Initialization function for the 512pt Double Precision floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_512_fast_init_f64( arm_rfft_fast_instance_f64 * S ) { + + arm_cfft_instance_f64 * Sint; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + Sint = &(S->Sint); + Sint->fftLen = 256U; + S->fftLenRFFT = 512U; + + Sint->bitRevLength = ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH; + Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_256; + Sint->pTwiddle = (float64_t *) twiddleCoefF64_256; + S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_512; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512) && defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024)) +/** + @brief Initialization function for the 1024pt Double Precision floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_1024_fast_init_f64( arm_rfft_fast_instance_f64 * S ) { + + arm_cfft_instance_f64 * Sint; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + Sint = &(S->Sint); + Sint->fftLen = 512U; + S->fftLenRFFT = 1024U; + + Sint->bitRevLength = ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH; + Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_512; + Sint->pTwiddle = (float64_t *) twiddleCoefF64_512; + S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_1024; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024) && defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048)) +/** + @brief Initialization function for the 2048pt Double Precision floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ +static arm_status arm_rfft_2048_fast_init_f64( arm_rfft_fast_instance_f64 * S ) { + + arm_cfft_instance_f64 * Sint; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + Sint = &(S->Sint); + Sint->fftLen = 1024U; + S->fftLenRFFT = 2048U; + + Sint->bitRevLength = ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH; + Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_1024; + Sint->pTwiddle = (float64_t *) twiddleCoefF64_1024; + S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_2048; + + return ARM_MATH_SUCCESS; +} +#endif + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048) && defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096)) +/** +* @brief Initialization function for the 4096pt Double Precision floating-point real FFT. +* @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected + */ + +static arm_status arm_rfft_4096_fast_init_f64( arm_rfft_fast_instance_f64 * S ) { + + arm_cfft_instance_f64 * Sint; + + if( !S ) return ARM_MATH_ARGUMENT_ERROR; + + Sint = &(S->Sint); + Sint->fftLen = 2048U; + S->fftLenRFFT = 4096U; + + Sint->bitRevLength = ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH; + Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_2048; + Sint->pTwiddle = (float64_t *) twiddleCoefF64_2048; + S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_4096; + + return ARM_MATH_SUCCESS; +} +#endif + +/** + @brief Initialization function for the Double Precision floating-point real FFT. + @param[in,out] S points to an arm_rfft_fast_instance_f64 structure + @param[in] fftLen length of the Real Sequence + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length + + @par Description + The parameter <code>fftLen</code> specifies the length of RFFT/CIFFT process. + Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096. + @par + This Function also initializes Twiddle factor table pointer and Bit reversal table pointer. + */ + +arm_status arm_rfft_fast_init_f64( + arm_rfft_fast_instance_f64 * S, + uint16_t fftLen) +{ + typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f64 *); + fft_init_ptr fptr = 0x0; + + switch (fftLen) + { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048) && defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096)) + case 4096U: + fptr = arm_rfft_4096_fast_init_f64; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024) && defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048)) + case 2048U: + fptr = arm_rfft_2048_fast_init_f64; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512) && defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024)) + case 1024U: + fptr = arm_rfft_1024_fast_init_f64; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256) && defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512)) + case 512U: + fptr = arm_rfft_512_fast_init_f64; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128) && defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256)) + case 256U: + fptr = arm_rfft_256_fast_init_f64; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64) && defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128)) + case 128U: + fptr = arm_rfft_128_fast_init_f64; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32) && defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64)) + case 64U: + fptr = arm_rfft_64_fast_init_f64; + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16) && defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32)) + case 32U: + fptr = arm_rfft_32_fast_init_f64; + break; +#endif + default: + break; + } + + if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR; + return fptr( S ); + +} + +/** + @} end of RealFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c new file mode 100644 index 0000000..0a32da6 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c @@ -0,0 +1,147 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_init_f32.c + * Description: RFFT & RIFFT Floating point initialisation function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" + + +/** + @addtogroup RealFFT + @{ + */ + +/** + @brief Initialization function for the floating-point RFFT/RIFFT. + @deprecated Do not use this function. It has been superceded by \ref arm_rfft_fast_init_f32 and will be removed in the future. + @param[in,out] S points to an instance of the floating-point RFFT/RIFFT structure + @param[in,out] S_CFFT points to an instance of the floating-point CFFT/CIFFT structure + @param[in] fftLenReal length of the FFT. + @param[in] ifftFlagR flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length + + @par Description + The parameter <code>fftLenReal</code>specifies length of RFFT/RIFFT Process. + Supported FFT Lengths are 128, 512, 2048. + @par + The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated. + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + This function also initializes Twiddle factor table. + */ + +arm_status arm_rfft_init_f32( + arm_rfft_instance_f32 * S, + arm_cfft_radix4_instance_f32 * S_CFFT, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_F32) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialize the Real FFT length */ + S->fftLenReal = (uint16_t) fftLenReal; + + /* Initialize the Complex FFT length */ + S->fftLenBy2 = (uint16_t) fftLenReal / 2U; + + /* Initialize the Twiddle coefficientA pointer */ + S->pTwiddleAReal = (float32_t *) realCoefA; + + /* Initialize the Twiddle coefficientB pointer */ + S->pTwiddleBReal = (float32_t *) realCoefB; + + /* Initialize the Flag for selection of RFFT or RIFFT */ + S->ifftFlagR = (uint8_t) ifftFlagR; + + /* Initialize the Flag for calculation Bit reversal or not */ + S->bitReverseFlagR = (uint8_t) bitReverseFlag; + + /* Initializations of structure parameters depending on the FFT length */ + switch (S->fftLenReal) + { + /* Init table modifier value */ + case 8192U: + S->twidCoefRModifier = 1U; + break; + case 2048U: + S->twidCoefRModifier = 4U; + break; + case 512U: + S->twidCoefRModifier = 16U; + break; + case 128U: + S->twidCoefRModifier = 64U; + break; + default: + /* Reporting argument error if rfftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + + /* Init Complex FFT Instance */ + S->pCfft = S_CFFT; + + if (S->ifftFlagR) + { + /* Initializes the CIFFT Module for fftLenreal/2 length */ + arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 1U, 0U); + } + else + { + /* Initializes the CFFT Module for fftLenreal/2 length */ + arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 0U, 0U); + } + +#endif +#endif + /* return the status of RFFT Init function */ + return (status); + +} + +/** + @} end of RealFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c new file mode 100644 index 0000000..e70f8af --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c @@ -0,0 +1,248 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_init_q15.c + * Description: RFFT & RIFFT Q15 initialisation function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" +#include "arm_const_structs.h" + +/** + @addtogroup RealFFT + @{ + */ + +/** + @brief Initialization function for the Q15 RFFT/RIFFT. + @param[in,out] S points to an instance of the Q15 RFFT/RIFFT structure + @param[in] fftLenReal length of the FFT + @param[in] ifftFlagR flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length + + @par Details + The parameter <code>fftLenReal</code> specifies length of RFFT/RIFFT Process. + Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192. + @par + The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated. + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + This function also initializes Twiddle factor table. + */ + +arm_status arm_rfft_init_q15( + arm_rfft_instance_q15 * S, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q15) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialize the Real FFT length */ + S->fftLenReal = (uint16_t) fftLenReal; + + /* Initialize the Twiddle coefficientA pointer */ + S->pTwiddleAReal = (q15_t *) realCoefAQ15; + + /* Initialize the Twiddle coefficientB pointer */ + S->pTwiddleBReal = (q15_t *) realCoefBQ15; + + /* Initialize the Flag for selection of RFFT or RIFFT */ + S->ifftFlagR = (uint8_t) ifftFlagR; + + /* Initialize the Flag for calculation Bit reversal or not */ + S->bitReverseFlagR = (uint8_t) bitReverseFlag; + + /* Initialization of coef modifier depending on the FFT length */ + switch (S->fftLenReal) + { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096)) + case 8192U: + S->twidCoefRModifier = 1U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),4096); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len4096; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048)) + case 4096U: + S->twidCoefRModifier = 2U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),2048); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len2048; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024)) + case 2048U: + S->twidCoefRModifier = 4U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),1024); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len1024; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512)) + case 1024U: + S->twidCoefRModifier = 8U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),512); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len512; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256)) + case 512U: + S->twidCoefRModifier = 16U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),256); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len256; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128)) + case 256U: + S->twidCoefRModifier = 32U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),128); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len128; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64)) + case 128U: + S->twidCoefRModifier = 64U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),64); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len64; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32)) + case 64U: + S->twidCoefRModifier = 128U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),32); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len32; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16)) + case 32U: + S->twidCoefRModifier = 256U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q15(&(S->cfftInst),16); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q15_len16; + #endif + break; +#endif + default: + /* Reporting argument error if rfftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif + /* return the status of RFFT Init function */ + return (status); +} + +/** + @} end of RealFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c new file mode 100644 index 0000000..0a28719 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c @@ -0,0 +1,246 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_init_q31.c + * Description: RFFT & RIFFT Q31 initialisation function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" +#include "arm_common_tables.h" +#include "arm_const_structs.h" + + + +/** + @addtogroup RealFFT + @{ + */ + +/** + @brief Initialization function for the Q31 RFFT/RIFFT. + @param[in,out] S points to an instance of the Q31 RFFT/RIFFT structure + @param[in] fftLenReal length of the FFT + @param[in] ifftFlagR flag that selects transform direction + - value = 0: forward transform + - value = 1: inverse transform + @param[in] bitReverseFlag flag that enables / disables bit reversal of output + - value = 0: disables bit reversal of output + - value = 1: enables bit reversal of output + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length + + @par Details + The parameter <code>fftLenReal</code> specifies length of RFFT/RIFFT Process. + Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192. + @par + The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed. + Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated. + @par + The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order. + Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order. + @par + This function also initializes Twiddle factor table. +*/ + +arm_status arm_rfft_init_q31( + arm_rfft_instance_q31 * S, + uint32_t fftLenReal, + uint32_t ifftFlagR, + uint32_t bitReverseFlag) +{ + /* Initialise the default arm status */ + arm_status status = ARM_MATH_ARGUMENT_ERROR; + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) + +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q31) + + /* Initialise the default arm status */ + status = ARM_MATH_SUCCESS; + + /* Initialize the Real FFT length */ + S->fftLenReal = (uint16_t) fftLenReal; + + /* Initialize the Twiddle coefficientA pointer */ + S->pTwiddleAReal = (q31_t *) realCoefAQ31; + + /* Initialize the Twiddle coefficientB pointer */ + S->pTwiddleBReal = (q31_t *) realCoefBQ31; + + /* Initialize the Flag for selection of RFFT or RIFFT */ + S->ifftFlagR = (uint8_t) ifftFlagR; + + /* Initialize the Flag for calculation Bit reversal or not */ + S->bitReverseFlagR = (uint8_t) bitReverseFlag; + + /* Initialization of coef modifier depending on the FFT length */ + switch (S->fftLenReal) + { +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096)) + case 8192U: + + + S->twidCoefRModifier = 1U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),4096); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len4096; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048)) + case 4096U: + S->twidCoefRModifier = 2U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),2048); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len2048; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024)) + case 2048U: + S->twidCoefRModifier = 4U; + + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),1024); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len1024; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512)) + case 1024U: + S->twidCoefRModifier = 8U; + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),512); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len512; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256)) + case 512U: + S->twidCoefRModifier = 16U; + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),256); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len256; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128)) + case 256U: + S->twidCoefRModifier = 32U; + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),128); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len128; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64)) + case 128U: + S->twidCoefRModifier = 64U; + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),64); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len64; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32)) + case 64U: + S->twidCoefRModifier = 128U; + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),32); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len32; + #endif + break; +#endif +#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16)) + case 32U: + S->twidCoefRModifier = 256U; + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + status=arm_cfft_init_q31(&(S->cfftInst),16); + if (status != ARM_MATH_SUCCESS) + { + return(status); + } + #else + S->pCfft = &arm_cfft_sR_q31_len16; + #endif + break; +#endif + default: + /* Reporting argument error if rfftSize is not valid value */ + status = ARM_MATH_ARGUMENT_ERROR; + break; + } + +#endif +#endif + /* return the status of RFFT Init function */ + return (status); +} + +/** + @} end of RealFFT group + */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c new file mode 100644 index 0000000..7d149c6 --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c @@ -0,0 +1,526 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_q15.c + * Description: RFFT & RIFFT Q15 process function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +/* ---------------------------------------------------------------------- + * Internal functions prototypes + * -------------------------------------------------------------------- */ + +void arm_split_rfft_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pATable, + const q15_t * pBTable, + q15_t * pDst, + uint32_t modifier); + +void arm_split_rifft_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pATable, + const q15_t * pBTable, + q15_t * pDst, + uint32_t modifier); + +/** + @addtogroup RealFFT + @{ + */ + +/** + @brief Processing function for the Q15 RFFT/RIFFT. + @param[in] S points to an instance of the Q15 RFFT/RIFFT structure + @param[in] pSrc points to input buffer (Source buffer is modified by this function.) + @param[out] pDst points to output buffer + @return none + + @par Input an output formats + Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process. + Hence the output format is different for different RFFT sizes. + The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT: + @par + \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT" + @par + \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT" + @par + If the input buffer is of length N, the output buffer must have length 2*N. + The input buffer is modified by this function. + @par + For the RIFFT, the source buffer must at least have length + fftLenReal + 2. + The last two elements must be equal to what would be generated + by the RFFT: + (pSrc[0] - pSrc[1]) >> 1 and 0 + */ + +void arm_rfft_q15( + const arm_rfft_instance_q15 * S, + q15_t * pSrc, + q15_t * pDst) +{ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + const arm_cfft_instance_q15 *S_CFFT = &(S->cfftInst); +#else + const arm_cfft_instance_q15 *S_CFFT = S->pCfft; +#endif + uint32_t L2 = S->fftLenReal >> 1U; + + /* Calculation of RIFFT of input */ + if (S->ifftFlagR == 1U) + { + /* Real IFFT core process */ + arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier); + + /* Complex IFFT process */ + arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR); + + arm_shift_q15(pDst, 1, pDst, S->fftLenReal); + } + else + { + /* Calculation of RFFT of input */ + + /* Complex FFT process */ + arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR); + + /* Real FFT core process */ + arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier); + } + +} + +/** + @} end of RealFFT group + */ + +/** + @brief Core Real FFT process + @param[in] pSrc points to input buffer + @param[in] fftLen length of FFT + @param[in] pATable points to twiddle Coef A buffer + @param[in] pBTable points to twiddle Coef B buffer + @param[out] pDst points to output buffer + @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + @return none + + @par + The function implements a Real FFT + */ + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_fft.h" + +#if defined(__CMSIS_GCC_H) +#define MVE_CMPLX_MULT_FX_AxB_S16(A,B) vqdmladhxq_s16(vqdmlsdhq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B) +#define MVE_CMPLX_MULT_FX_AxConjB_S16(A,B) vqdmladhq_s16(vqdmlsdhxq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B) + +#endif + +void arm_split_rfft_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pATable, + const q15_t * pBTable, + q15_t * pDst, + uint32_t modifier) +{ + uint32_t i; /* Loop Counter */ + const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q15_t *pOut1 = &pDst[2]; + q15_t *pIn1 = &pSrc[2]; + uint16x8_t offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 }; + uint16x8_t offsetCoef; + const uint16_t offsetCoefArr[16] = { + 0, 0, 2, 2, 4, 4, 6, 6, + 0, 1, 0, 1, 0, 1, 0, 1 + }; + + offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8); + offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8)); + + /* Init coefficient pointers */ + pCoefA = &pATable[modifier * 2]; + pCoefB = &pBTable[modifier * 2]; + + const q15_t *pCoefAb, *pCoefBb; + pCoefAb = pCoefA; + pCoefBb = pCoefB; + + pIn1 = &pSrc[2]; + + i = fftLen - 1U; + i = i / 4 + 1; + while (i > 0U) { + q15x8_t in1 = vld1q_s16(pIn1); + q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn); + q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef); + q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef); + +#if defined(__CMSIS_GCC_H) + q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB_S16(in1, coefA), + MVE_CMPLX_MULT_FX_AxConjB_S16(coefB, in2)); +#else + q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB(in1, coefA, q15x8_t), + MVE_CMPLX_MULT_FX_AxConjB(coefB, in2, q15x8_t)); +#endif + vst1q_s16(pOut1, out); + pOut1 += 8; + + offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8); + offsetIn -= 8; + pIn1 += 8; + i -= 1; + } + + pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U; + pDst[2 * fftLen + 1] = 0; + + pDst[0] = (pSrc[0] + pSrc[1]) >> 1U; + pDst[1] = 0; +} +#else +void arm_split_rfft_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pATable, + const q15_t * pBTable, + q15_t * pDst, + uint32_t modifier) +{ + uint32_t i; /* Loop Counter */ + q31_t outR, outI; /* Temporary variables for output */ + const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q15_t *pSrc1, *pSrc2; +#if defined (ARM_MATH_DSP) + q15_t *pD1, *pD2; +#endif + + /* Init coefficient pointers */ + pCoefA = &pATable[modifier * 2]; + pCoefB = &pBTable[modifier * 2]; + + pSrc1 = &pSrc[2]; + pSrc2 = &pSrc[(2U * fftLen) - 2U]; + +#if defined (ARM_MATH_DSP) + + i = 1U; + pD1 = pDst + 2; + pD2 = pDst + (4U * fftLen) - 2; + + for (i = fftLen - 1; i > 0; i--) + { + /* + outR = ( pSrc[2 * i] * pATable[2 * i] + - pSrc[2 * i + 1] * pATable[2 * i + 1] + + pSrc[2 * n - 2 * i] * pBTable[2 * i] + + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]); + + outI = ( pIn[2 * i + 1] * pATable[2 * i] + + pIn[2 * i] * pATable[2 * i + 1] + + pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]) + */ + + +#ifndef ARM_MATH_BIG_ENDIAN + /* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */ + outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)); +#else + /* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */ + outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA))); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */ + outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U; + + /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */ +#ifndef ARM_MATH_BIG_ENDIAN + outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB)); +#else + outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2)); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] */ + outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI); + + /* write output */ + *pD1++ = (q15_t) outR; + *pD1++ = outI >> 16U; + + /* write complex conjugate output */ + pD2[0] = (q15_t) outR; + pD2[1] = -(outI >> 16U); + pD2 -= 2; + + /* update coefficient pointer */ + pCoefB = pCoefB + (2U * modifier); + pCoefA = pCoefA + (2U * modifier); + } + + pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1U; + pDst[2U * fftLen + 1U] = 0; + + pDst[0] = (pSrc[0] + pSrc[1]) >> 1U; + pDst[1] = 0; + +#else + + i = 1U; + + while (i < fftLen) + { + /* + outR = ( pSrc[2 * i] * pATable[2 * i] + - pSrc[2 * i + 1] * pATable[2 * i + 1] + + pSrc[2 * n - 2 * i] * pBTable[2 * i] + + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]); + */ + + outR = *pSrc1 * *pCoefA; + outR = outR - (*(pSrc1 + 1) * *(pCoefA + 1)); + outR = outR + (*pSrc2 * *pCoefB); + outR = (outR + (*(pSrc2 + 1) * *(pCoefB + 1))) >> 16; + + /* + outI = ( pIn[2 * i + 1] * pATable[2 * i] + + pIn[2 * i] * pATable[2 * i + 1] + + pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); + */ + + outI = *pSrc2 * *(pCoefB + 1); + outI = outI - (*(pSrc2 + 1) * *pCoefB); + outI = outI + (*(pSrc1 + 1) * *pCoefA); + outI = outI + (*pSrc1 * *(pCoefA + 1)); + + /* update input pointers */ + pSrc1 += 2U; + pSrc2 -= 2U; + + /* write output */ + pDst[2U * i] = (q15_t) outR; + pDst[2U * i + 1U] = outI >> 16U; + + /* write complex conjugate output */ + pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR; + pDst[((4U * fftLen) - (2U * i)) + 1U] = -(outI >> 16U); + + /* update coefficient pointer */ + pCoefB = pCoefB + (2U * modifier); + pCoefA = pCoefA + (2U * modifier); + + i++; + } + + pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1; + pDst[2U * fftLen + 1U] = 0; + + pDst[0] = (pSrc[0] + pSrc[1]) >> 1; + pDst[1] = 0; + +#endif /* #if defined (ARM_MATH_DSP) */ +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @brief Core Real IFFT process + @param[in] pSrc points to input buffer + @param[in] fftLen length of FFT + @param[in] pATable points to twiddle Coef A buffer + @param[in] pBTable points to twiddle Coef B buffer + @param[out] pDst points to output buffer + @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + @return none + + @par + The function implements a Real IFFT + */ + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_fft.h" + +void arm_split_rifft_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pATable, + const q15_t * pBTable, + q15_t * pDst, + uint32_t modifier) +{ + uint32_t i; /* Loop Counter */ + const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q15_t *pIn1; + uint16x8_t offset = { 6, 7, 4, 5, 2, 3, 0, 1 }; + uint16x8_t offsetCoef; + int16x8_t conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */ + const uint16_t offsetCoefArr[16] = { + 0, 0, 2, 2, 4, 4, 6, 6, + 0, 1, 0, 1, 0, 1, 0, 1 + }; + + offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8); + + offset = vaddq_n_u16(offset, (2 * fftLen - 6)); + + /* Init coefficient pointers */ + pCoefA = &pATable[0]; + pCoefB = &pBTable[0]; + + const q15_t *pCoefAb, *pCoefBb; + pCoefAb = pCoefA; + pCoefBb = pCoefB; + + pIn1 = &pSrc[0]; + + i = fftLen; + i = i / 4; + + while (i > 0U) { + q15x8_t in1 = vld1q_s16(pIn1); + q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset); + q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef); + q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef); + + /* can we avoid the conjugate here ? */ + q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA, q15x8_t), + vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB, q15x8_t))); + + vst1q_s16(pDst, out); + pDst += 8; + + offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8); + offset -= 8; + + pIn1 += 8; + i -= 1; + } +} +#else +void arm_split_rifft_q15( + q15_t * pSrc, + uint32_t fftLen, + const q15_t * pATable, + const q15_t * pBTable, + q15_t * pDst, + uint32_t modifier) +{ + uint32_t i; /* Loop Counter */ + q31_t outR, outI; /* Temporary variables for output */ + const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q15_t *pSrc1, *pSrc2; + q15_t *pDst1 = &pDst[0]; + + pCoefA = &pATable[0]; + pCoefB = &pBTable[0]; + + pSrc1 = &pSrc[0]; + pSrc2 = &pSrc[2 * fftLen]; + + i = fftLen; + while (i > 0U) + { + /* + outR = ( pIn[2 * i] * pATable[2 * i] + + pIn[2 * i + 1] * pATable[2 * i + 1] + + pIn[2 * n - 2 * i] * pBTable[2 * i] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]); + + outI = ( pIn[2 * i + 1] * pATable[2 * i] + - pIn[2 * i] * pATable[2 * i + 1] + - pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); + */ + +#if defined (ARM_MATH_DSP) + +#ifndef ARM_MATH_BIG_ENDIAN + /* pIn[2 * n - 2 * i] * pBTable[2 * i] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */ + outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)); +#else + /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */ + outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB))); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] + pIn[2 * n - 2 * i] * pBTable[2 * i] */ + outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U; + + /* -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */ + outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB)); + + /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */ +#ifndef ARM_MATH_BIG_ENDIAN + outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI); +#else + outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + /* write output */ +#ifndef ARM_MATH_BIG_ENDIAN + write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16)); +#else + write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16)); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ + + +#else /* #if defined (ARM_MATH_DSP) */ + + outR = *pSrc2 * *pCoefB; + outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1)); + outR = outR + (*pSrc1 * *pCoefA); + outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16; + + outI = *(pSrc1 + 1) * *pCoefA; + outI = outI - (*pSrc1 * *(pCoefA + 1)); + outI = outI - (*pSrc2 * *(pCoefB + 1)); + outI = outI - (*(pSrc2 + 1) * *(pCoefB)); + + /* update input pointers */ + pSrc1 += 2U; + pSrc2 -= 2U; + + /* write output */ + *pDst1++ = (q15_t) outR; + *pDst1++ = (q15_t) (outI >> 16); + +#endif /* #if defined (ARM_MATH_DSP) */ + + /* update coefficient pointer */ + pCoefB = pCoefB + (2 * modifier); + pCoefA = pCoefA + (2 * modifier); + + i--; + } + +} +#endif /* defined(ARM_MATH_MVEI) */ diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c new file mode 100644 index 0000000..ad3212d --- /dev/null +++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c @@ -0,0 +1,430 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_rfft_q31.c + * Description: FFT & RIFFT Q31 process function + * + * $Date: 23 April 2021 + * $Revision: V1.9.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/transform_functions.h" + +/* ---------------------------------------------------------------------- + * Internal functions prototypes + * -------------------------------------------------------------------- */ + +void arm_split_rfft_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pATable, + const q31_t * pBTable, + q31_t * pDst, + uint32_t modifier); + +void arm_split_rifft_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pATable, + const q31_t * pBTable, + q31_t * pDst, + uint32_t modifier); + +/** + @addtogroup RealFFT + @{ + */ + +/** + @brief Processing function for the Q31 RFFT/RIFFT. + @param[in] S points to an instance of the Q31 RFFT/RIFFT structure + @param[in] pSrc points to input buffer (Source buffer is modified by this function) + @param[out] pDst points to output buffer + @return none + + @par Input an output formats + Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process. + Hence the output format is different for different RFFT sizes. + The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT: + @par + \image html RFFTQ31.gif "Input and Output Formats for Q31 RFFT" + @par + \image html RIFFTQ31.gif "Input and Output Formats for Q31 RIFFT" + @par + If the input buffer is of length N, the output buffer must have length 2*N. + The input buffer is modified by this function. + @par + For the RIFFT, the source buffer must at least have length + fftLenReal + 2. + The last two elements must be equal to what would be generated + by the RFFT: + (pSrc[0] - pSrc[1]) >> 1 and 0 + + */ + +void arm_rfft_q31( + const arm_rfft_instance_q31 * S, + q31_t * pSrc, + q31_t * pDst) +{ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + const arm_cfft_instance_q31 *S_CFFT = &(S->cfftInst); +#else + const arm_cfft_instance_q31 *S_CFFT = S->pCfft; +#endif + uint32_t L2 = S->fftLenReal >> 1U; + + /* Calculation of RIFFT of input */ + if (S->ifftFlagR == 1U) + { + /* Real IFFT core process */ + arm_split_rifft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier); + + /* Complex IFFT process */ + arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR); + + arm_shift_q31(pDst, 1, pDst, S->fftLenReal); + } + else + { + /* Calculation of RFFT of input */ + + /* Complex FFT process */ + arm_cfft_q31 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR); + + /* Real FFT core process */ + arm_split_rfft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier); + } + +} + +/** + @} end of RealFFT group + */ + +/** + @brief Core Real FFT process + @param[in] pSrc points to input buffer + @param[in] fftLen length of FFT + @param[in] pATable points to twiddle Coef A buffer + @param[in] pBTable points to twiddle Coef B buffer + @param[out] pDst points to output buffer + @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + @return none + */ + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_fft.h" + +#if defined(__CMSIS_GCC_H) + +#define MVE_CMPLX_MULT_FX_AxB_S32(A,B) vqdmladhxq_s32(vqdmlsdhq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B) +#define MVE_CMPLX_MULT_FX_AxConjB_S32(A,B) vqdmladhq_s32(vqdmlsdhxq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B) + +#endif + +void arm_split_rfft_q31( + q31_t *pSrc, + uint32_t fftLen, + const q31_t *pATable, + const q31_t *pBTable, + q31_t *pDst, + uint32_t modifier) +{ + uint32_t i; /* Loop Counter */ + const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q31_t *pOut1 = &pDst[2]; + q31_t *pIn1 = &pSrc[2]; + uint32x4_t offset = { 2, 3, 0, 1 }; + uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 }; + + offset = offset + (2 * fftLen - 4); + + + /* Init coefficient pointers */ + pCoefA = &pATable[modifier * 2]; + pCoefB = &pBTable[modifier * 2]; + + const q31_t *pCoefAb, *pCoefBb; + pCoefAb = pCoefA; + pCoefBb = pCoefB; + + pIn1 = &pSrc[2]; + + i = fftLen - 1U; + i = i / 2 + 1; + while (i > 0U) { + q31x4_t in1 = vld1q_s32(pIn1); + q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset); + q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef); + q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef); +#if defined(__CMSIS_GCC_H) + q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB_S32(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB_S32(coefB, in2)); +#else + q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB(in1, coefA, q31x4_t), + MVE_CMPLX_MULT_FX_AxConjB(coefB, in2, q31x4_t)); +#endif + vst1q(pOut1, out); + pOut1 += 4; + + offsetCoef += modifier * 4; + offset -= 4; + + pIn1 += 4; + i -= 1; + } + + pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U; + pDst[2 * fftLen + 1] = 0; + + pDst[0] = (pSrc[0] + pSrc[1]) >> 1U; + pDst[1] = 0; +} +#else +void arm_split_rfft_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pATable, + const q31_t * pBTable, + q31_t * pDst, + uint32_t modifier) +{ + uint32_t i; /* Loop Counter */ + q31_t outR, outI; /* Temporary variables for output */ + const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q31_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */ + q31_t *pOut1 = &pDst[2], *pOut2 = &pDst[4 * fftLen - 1]; + q31_t *pIn1 = &pSrc[2], *pIn2 = &pSrc[2 * fftLen - 1]; + + /* Init coefficient pointers */ + pCoefA = &pATable[modifier * 2]; + pCoefB = &pBTable[modifier * 2]; + + i = fftLen - 1U; + + while (i > 0U) + { + /* + outR = ( pSrc[2 * i] * pATable[2 * i] + - pSrc[2 * i + 1] * pATable[2 * i + 1] + + pSrc[2 * n - 2 * i] * pBTable[2 * i] + + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]); + + outI = ( pIn[2 * i + 1] * pATable[2 * i] + + pIn[2 * i] * pATable[2 * i + 1] + + pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); + */ + + CoefA1 = *pCoefA++; + CoefA2 = *pCoefA; + + /* outR = (pSrc[2 * i] * pATable[2 * i] */ + mult_32x32_keep32_R (outR, *pIn1, CoefA1); + + /* outI = pIn[2 * i] * pATable[2 * i + 1] */ + mult_32x32_keep32_R (outI, *pIn1++, CoefA2); + + /* - pSrc[2 * i + 1] * pATable[2 * i + 1] */ + multSub_32x32_keep32_R (outR, *pIn1, CoefA2); + + /* (pIn[2 * i + 1] * pATable[2 * i] */ + multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1); + + /* pSrc[2 * n - 2 * i] * pBTable[2 * i] */ + multSub_32x32_keep32_R (outR, *pIn2, CoefA2); + CoefB1 = *pCoefB; + + /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */ + multSub_32x32_keep32_R (outI, *pIn2--, CoefB1); + + /* pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */ + multAcc_32x32_keep32_R (outR, *pIn2, CoefB1); + + /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */ + multSub_32x32_keep32_R (outI, *pIn2--, CoefA2); + + /* write output */ + *pOut1++ = outR; + *pOut1++ = outI; + + /* write complex conjugate output */ + *pOut2-- = -outI; + *pOut2-- = outR; + + /* update coefficient pointer */ + pCoefB = pCoefB + (2 * modifier); + pCoefA = pCoefA + (2 * modifier - 1); + + /* Decrement loop count */ + i--; + } + + pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U; + pDst[2 * fftLen + 1] = 0; + + pDst[0] = (pSrc[0] + pSrc[1]) >> 1U; + pDst[1] = 0; +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @brief Core Real IFFT process + @param[in] pSrc points to input buffer + @param[in] fftLen length of FFT + @param[in] pATable points to twiddle Coef A buffer + @param[in] pBTable points to twiddle Coef B buffer + @param[out] pDst points to output buffer + @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table + @return none + */ + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +void arm_split_rifft_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pATable, + const q31_t * pBTable, + q31_t * pDst, + uint32_t modifier) +{ + uint32_t i; /* Loop Counter */ + const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q31_t *pIn1; + uint32x4_t offset = { 2, 3, 0, 1 }; + uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 }; + int32x4_t conj = { 1, -1, 1, -1 }; + + offset = offset + (2 * fftLen - 2); + + /* Init coefficient pointers */ + pCoefA = &pATable[0]; + pCoefB = &pBTable[0]; + + const q31_t *pCoefAb, *pCoefBb; + pCoefAb = pCoefA; + pCoefBb = pCoefB; + + pIn1 = &pSrc[0]; + + i = fftLen; + i = i >> 1; + while (i > 0U) { + q31x4_t in1 = vld1q_s32(pIn1); + q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset); + q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef); + q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef); + + /* can we avoid the conjugate here ? */ +#if defined(__CMSIS_GCC_H) + q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB_S32(in1, coefA), + vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB_S32(in2, coefB))); +#else + q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA, q31x4_t), + vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB, q31x4_t))); +#endif + vst1q_s32(pDst, out); + pDst += 4; + + offsetCoef += modifier * 4; + offset -= 4; + + pIn1 += 4; + i -= 1; + } +} +#else +void arm_split_rifft_q31( + q31_t * pSrc, + uint32_t fftLen, + const q31_t * pATable, + const q31_t * pBTable, + q31_t * pDst, + uint32_t modifier) +{ + q31_t outR, outI; /* Temporary variables for output */ + const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q31_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */ + q31_t *pIn1 = &pSrc[0], *pIn2 = &pSrc[2 * fftLen + 1]; + + pCoefA = &pATable[0]; + pCoefB = &pBTable[0]; + + while (fftLen > 0U) + { + /* + outR = ( pIn[2 * i] * pATable[2 * i] + + pIn[2 * i + 1] * pATable[2 * i + 1] + + pIn[2 * n - 2 * i] * pBTable[2 * i] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]); + + outI = ( pIn[2 * i + 1] * pATable[2 * i] + - pIn[2 * i] * pATable[2 * i + 1] + - pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]); + */ + + CoefA1 = *pCoefA++; + CoefA2 = *pCoefA; + + /* outR = (pIn[2 * i] * pATable[2 * i] */ + mult_32x32_keep32_R (outR, *pIn1, CoefA1); + + /* - pIn[2 * i] * pATable[2 * i + 1] */ + mult_32x32_keep32_R (outI, *pIn1++, -CoefA2); + + /* pIn[2 * i + 1] * pATable[2 * i + 1] */ + multAcc_32x32_keep32_R (outR, *pIn1, CoefA2); + + /* pIn[2 * i + 1] * pATable[2 * i] */ + multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1); + + /* pIn[2 * n - 2 * i] * pBTable[2 * i] */ + multAcc_32x32_keep32_R (outR, *pIn2, CoefA2); + CoefB1 = *pCoefB; + + /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */ + multSub_32x32_keep32_R (outI, *pIn2--, CoefB1); + + /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */ + multAcc_32x32_keep32_R (outR, *pIn2, CoefB1); + + /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */ + multAcc_32x32_keep32_R (outI, *pIn2--, CoefA2); + + /* write output */ + *pDst++ = outR; + *pDst++ = outI; + + /* update coefficient pointer */ + pCoefB = pCoefB + (modifier * 2); + pCoefA = pCoefA + (modifier * 2 - 1); + + /* Decrement loop count */ + fftLen--; + } + +} + +#endif /* defined(ARM_MATH_MVEI) */ |