35 files changed, 12257 insertions, 14972 deletions
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/CMakeLists.txt b/Drivers/CMSIS/DSP/Source/TransformFunctions/CMakeLists.txt
index e598f82..834409e 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/CMakeLists.txt
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/CMakeLists.txt
@@ -1,213 +1,116 @@
-cmake_minimum_required (VERSION 3.14)
-
-project(CMSISDSPTransform)
-
-include(configLib)
-include(configDsp)
-
-add_library(CMSISDSPTransform STATIC)
-configLib(CMSISDSPTransform ${ROOT})
-configDsp(CMSISDSPTransform ${ROOT})
-
-include(fft)
-fft(CMSISDSPTransform)
-
-if (CONFIGTABLE AND ALLFFT)
-target_compile_definitions(CMSISDSPTransform PUBLIC ARM_ALL_FFT_TABLES) 
-endif() 
-
-target_sources(CMSISDSPTransform PRIVATE arm_bitreversal.c)
-target_sources(CMSISDSPTransform PRIVATE arm_bitreversal2.c)
-
-if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
-target_sources(CMSISDSPTransform PRIVATE arm_bitreversal_f16.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR CFFT_F32_16 OR CFFT_F32_32 OR CFFT_F32_64 OR CFFT_F32_128 OR CFFT_F32_256 OR CFFT_F32_512 
-    OR CFFT_F32_1024 OR CFFT_F32_2048 OR CFFT_F32_4096)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f32.c)
-endif()
-
-if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
-if (NOT CONFIGTABLE OR ALLFFT OR CFFT_F16_16 OR CFFT_F16_32 OR CFFT_F16_64 OR CFFT_F16_128 OR CFFT_F16_256 OR CFFT_F16_512 
-    OR CFFT_F16_1024 OR CFFT_F16_2048 OR CFFT_F16_4096)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f16.c)
-endif()
-endif()
-
-if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
-if (NOT CONFIGTABLE OR ALLFFT OR RFFT_F16_128 OR RFFT_F16_512 OR RFFT_F16_2048 OR RFFT_F16_8192)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f16.c)
-endif()
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR CFFT_F64_16 OR CFFT_F64_32 OR CFFT_F64_64 OR CFFT_F64_128 OR CFFT_F64_256 OR CFFT_F64_512 
-    OR CFFT_F64_1024 OR CFFT_F64_2048 OR CFFT_F64_4096)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_f64.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f64.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR CFFT_Q15_16 OR CFFT_Q15_32 OR CFFT_Q15_64 OR CFFT_Q15_128 OR CFFT_Q15_256 OR CFFT_Q15_512 
-    OR CFFT_Q15_1024 OR CFFT_Q15_2048 OR CFFT_Q15_4096)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q15.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR CFFT_Q31_16 OR CFFT_Q31_32 OR CFFT_Q31_64 OR CFFT_Q31_128 OR CFFT_Q31_256 OR CFFT_Q31_512 
-    OR CFFT_Q31_1024 OR CFFT_Q31_2048 OR CFFT_Q31_4096)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q31.c)
-endif()
-
-
-if (NOT CONFIGTABLE OR ALLFFT OR DCT4_F32_128 OR DCT4_F32_512 OR DCT4_F32_2048 OR DCT4_F32_8192)
-target_sources(CMSISDSPTransform PRIVATE arm_dct4_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_f32.c)
-
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR DCT4_Q31_128 OR DCT4_Q31_512 OR DCT4_Q31_2048 OR DCT4_Q31_8192)
-target_sources(CMSISDSPTransform PRIVATE arm_dct4_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_q31.c)
-
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR ALLFFT OR DCT4_Q15_128 OR DCT4_Q15_512 OR DCT4_Q15_2048 OR DCT4_Q15_8192)
-target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_dct4_q15.c)
-
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR RFFT_FAST_F32_32 OR RFFT_FAST_F32_64 OR RFFT_FAST_F32_128
-   OR RFFT_FAST_F32_256 OR RFFT_FAST_F32_512 OR RFFT_FAST_F32_1024 OR RFFT_FAST_F32_2048
-   OR RFFT_FAST_F32_4096 )
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_init_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f32.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR RFFT_FAST_F64_32 OR RFFT_FAST_F64_64 OR RFFT_FAST_F64_128
-   OR RFFT_FAST_F64_256 OR RFFT_FAST_F64_512 OR RFFT_FAST_F64_1024 OR RFFT_FAST_F64_2048
-   OR RFFT_FAST_F64_4096 )
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_f64.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_init_f64.c)
-endif()
-
-if ((NOT DISABLEFLOAT16))
-if (NOT CONFIGTABLE OR ALLFFT OR RFFT_FAST_F16_32 OR RFFT_FAST_F16_64 OR RFFT_FAST_F16_128
-   OR RFFT_FAST_F16_256 OR RFFT_FAST_F16_512 OR RFFT_FAST_F16_1024 OR RFFT_FAST_F16_2048
-   OR RFFT_FAST_F16_4096 )
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_init_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f16.c)
-endif()
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR RFFT_F32_128 OR RFFT_F32_512 OR RFFT_F32_2048 OR RFFT_F32_8192)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR RFFT_Q15_32 OR RFFT_Q15_64 OR RFFT_Q15_128 OR RFFT_Q15_256
-     OR RFFT_Q15_512 OR RFFT_Q15_1024 OR RFFT_Q15_2048 OR RFFT_Q15_4096 OR RFFT_Q15_8192)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
-endif()
-
-if (NOT CONFIGTABLE OR ALLFFT OR RFFT_Q31_32 OR RFFT_Q31_64 OR RFFT_Q31_128 OR RFFT_Q31_256
-     OR RFFT_Q31_512 OR RFFT_Q31_1024 OR RFFT_Q31_2048 OR RFFT_Q31_4096 OR RFFT_Q31_8192)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_rfft_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_init_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
-endif()
-
-if (WRAPPER OR ARM_CFFT_RADIX2_Q15)
-  target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_q15.c)
-endif()
-
-if (NOT CONFIGTABLE  OR ALLFFT OR ARM_CFFT_RADIX4_Q15)
-  target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q15.c)
-endif()
-
-if (WRAPPER OR ARM_CFFT_RADIX2_Q31)
-  target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_q31.c)
-endif()
-
-if (NOT CONFIGTABLE  OR ALLFFT OR ARM_CFFT_RADIX4_Q31)
-  target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q31.c)
-endif()
-
-# For scipy or wrappers or benchmarks
-if (WRAPPER)
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_f32.c)
-if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
-target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_f16.c)
-endif()
-
-    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_BITREV_1024)
-    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
-    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
-    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
-if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
-    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
-endif()
-endif()
-
-target_sources(CMSISDSPTransform PRIVATE arm_mfcc_init_f32.c)
-target_sources(CMSISDSPTransform PRIVATE arm_mfcc_f32.c)
-
-target_sources(CMSISDSPTransform PRIVATE arm_mfcc_init_q31.c)
-target_sources(CMSISDSPTransform PRIVATE arm_mfcc_q31.c)
-
-target_sources(CMSISDSPTransform PRIVATE arm_mfcc_init_q15.c)
-target_sources(CMSISDSPTransform PRIVATE arm_mfcc_q15.c)
-
-if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
-target_sources(CMSISDSPTransform PRIVATE arm_mfcc_init_f16.c)
-target_sources(CMSISDSPTransform PRIVATE arm_mfcc_f16.c)
-endif()
-
-### Includes
-target_include_directories(CMSISDSPTransform PUBLIC "${DSP}/Include")
-
-
-
+cmake_minimum_required (VERSION 3.6)
+
+project(CMSISDSPTransform)
+
+
+
+add_library(CMSISDSPTransform STATIC)
+
+include(fft)
+fft(CMSISDSPTransform)
+
+if (CONFIGTABLE AND ALLFFT)
+target_compile_definitions(CMSISDSPTransform PUBLIC ARM_ALL_FFT_TABLES) 
+endif() 
+
+target_sources(CMSISDSPTransform PRIVATE arm_bitreversal.c)
+target_sources(CMSISDSPTransform PRIVATE arm_bitreversal2.c)
+
+if (NOT CONFIGTABLE OR ALLFFT OR CFFT_F32_16 OR CFFT_F32_32 OR CFFT_F32_64 OR CFFT_F32_128 OR CFFT_F32_256 OR CFFT_F32_512 
+    OR CFFT_F32_1024 OR CFFT_F32_2048 OR CFFT_F32_4096)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR CFFT_Q15_16 OR CFFT_Q15_32 OR CFFT_Q15_64 OR CFFT_Q15_128 OR CFFT_Q15_256 OR CFFT_Q15_512 
+    OR CFFT_Q15_1024 OR CFFT_Q15_2048 OR CFFT_Q15_4096)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR CFFT_Q31_16 OR CFFT_Q31_32 OR CFFT_Q31_64 OR CFFT_Q31_128 OR CFFT_Q31_256 OR CFFT_Q31_512 
+    OR CFFT_Q31_1024 OR CFFT_Q31_2048 OR CFFT_Q31_4096)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix2_init_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR DCT4_F32_128 OR DCT4_F32_512 OR DCT4_F32_2048 OR DCT4_F32_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_f32.c)
+
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR DCT4_Q31_128 OR DCT4_Q31_512 OR DCT4_Q31_2048 OR DCT4_Q31_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_q31.c)
+
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR ALLFFT OR DCT4_Q15_128 OR DCT4_Q15_512 OR DCT4_Q15_2048 OR DCT4_Q15_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_dct4_q15.c)
+
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR RFFT_FAST_F32_32 OR RFFT_FAST_F32_64 OR RFFT_FAST_F32_128
+   OR RFFT_FAST_F32_256 OR RFFT_FAST_F32_512 OR RFFT_FAST_F32_1024 OR RFFT_FAST_F32_2048
+   OR RFFT_FAST_F32_4096 )
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_fast_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix8_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR RFFT_F32_128 OR RFFT_F32_512 OR RFFT_F32_2048 OR RFFT_F32_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_init_f32.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_f32.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR RFFT_Q15_32 OR RFFT_Q15_64 OR RFFT_Q15_128 OR RFFT_Q15_256
+     OR RFFT_Q15_512 OR RFFT_Q15_1024 OR RFFT_Q15_2048 OR RFFT_Q15_4096 OR RFFT_Q15_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q15.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q15.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFFT OR RFFT_Q31_32 OR RFFT_Q31_64 OR RFFT_Q31_128 OR RFFT_Q31_256
+     OR RFFT_Q31_512 OR RFFT_Q31_1024 OR RFFT_Q31_2048 OR RFFT_Q31_4096 OR RFFT_Q31_8192)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_init_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_rfft_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_q31.c)
+target_sources(CMSISDSPTransform PRIVATE arm_cfft_radix4_q31.c)
+endif()
+
+configdsp(CMSISDSPTransform ..)
+
+### Includes
+target_include_directories(CMSISDSPTransform PUBLIC "${DSP}/../../Include")
+
+
+
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c
index f327801..fcd0d95 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c
@@ -1,83 +1,60 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        TransformFunctions.c
- * Description:  Combination of all transform function source files.
- *
- * $Date:        18. March 2019
- * $Revision:    V1.0.0
- *
- * Target Processor: Cortex-M cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "arm_bitreversal.c"
-#include "arm_bitreversal2.c"
-#include "arm_cfft_f32.c"
-#include "arm_cfft_f64.c"
-#include "arm_cfft_q15.c"
-#include "arm_cfft_q31.c"
-#include "arm_cfft_init_f32.c"
-#include "arm_cfft_init_f64.c"
-#include "arm_cfft_init_q15.c"
-#include "arm_cfft_init_q31.c"
-#include "arm_cfft_radix2_f32.c"
-#include "arm_cfft_radix2_q15.c"
-#include "arm_cfft_radix2_q31.c"
-#include "arm_cfft_radix4_f32.c"
-#include "arm_cfft_radix4_q15.c"
-#include "arm_cfft_radix4_q31.c"
-#include "arm_cfft_radix8_f32.c"
-#include "arm_rfft_fast_f32.c"
-#include "arm_rfft_fast_f64.c"
-#include "arm_rfft_fast_init_f32.c"
-#include "arm_rfft_fast_init_f64.c"
-
-#include "arm_mfcc_init_f32.c"
-#include "arm_mfcc_f32.c"
-
-#include "arm_mfcc_init_q31.c"
-#include "arm_mfcc_q31.c"
-
-#include "arm_mfcc_init_q15.c"
-#include "arm_mfcc_q15.c"
-
-/* Deprecated */
-
-#include "arm_dct4_f32.c"
-#include "arm_dct4_init_f32.c"
-#include "arm_dct4_init_q15.c"
-#include "arm_dct4_init_q31.c"
-#include "arm_dct4_q15.c"
-#include "arm_dct4_q31.c"
-
-#include "arm_rfft_f32.c"
-#include "arm_rfft_q15.c"
-#include "arm_rfft_q31.c"
-
-#include "arm_rfft_init_f32.c"
-#include "arm_rfft_init_q15.c"
-#include "arm_rfft_init_q31.c"
-
-#include "arm_cfft_radix4_init_f32.c"
-#include "arm_cfft_radix4_init_q15.c"
-#include "arm_cfft_radix4_init_q31.c"
-
-#include "arm_cfft_radix2_init_f32.c"
-#include "arm_cfft_radix2_init_q15.c"
-#include "arm_cfft_radix2_init_q31.c"
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        TransformFunctions.c
+ * Description:  Combination of all transform function source files.
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_bitreversal.c"
+#include "arm_bitreversal2.c"
+#include "arm_cfft_f32.c"
+#include "arm_cfft_q15.c"
+#include "arm_cfft_q31.c"
+#include "arm_cfft_radix2_f32.c"
+#include "arm_cfft_radix2_init_f32.c"
+#include "arm_cfft_radix2_init_q15.c"
+#include "arm_cfft_radix2_init_q31.c"
+#include "arm_cfft_radix2_q15.c"
+#include "arm_cfft_radix2_q31.c"
+#include "arm_cfft_radix4_f32.c"
+#include "arm_cfft_radix4_init_f32.c"
+#include "arm_cfft_radix4_init_q15.c"
+#include "arm_cfft_radix4_init_q31.c"
+#include "arm_cfft_radix4_q15.c"
+#include "arm_cfft_radix4_q31.c"
+#include "arm_cfft_radix8_f32.c"
+#include "arm_dct4_f32.c"
+#include "arm_dct4_init_f32.c"
+#include "arm_dct4_init_q15.c"
+#include "arm_dct4_init_q31.c"
+#include "arm_dct4_q15.c"
+#include "arm_dct4_q31.c"
+#include "arm_rfft_f32.c"
+#include "arm_rfft_fast_f32.c"
+#include "arm_rfft_fast_init_f32.c"
+#include "arm_rfft_init_f32.c"
+#include "arm_rfft_init_q15.c"
+#include "arm_rfft_init_q31.c"
+#include "arm_rfft_q15.c"
+#include "arm_rfft_q31.c"
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c
index 687a9e8..de95c59 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c
@@ -1,230 +1,229 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_bitreversal.c
- * Description:  Bitreversal functions
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-
-/**
-  @brief         In-place floating-point bit reversal function.
-  @param[in,out] pSrc         points to in-place floating-point data buffer
-  @param[in]     fftSize      length of FFT
-  @param[in]     bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
-  @param[in]     pBitRevTab   points to bit reversal table
-  @return        none
- */
-
-void arm_bitreversal_f32(
-        float32_t * pSrc,
-        uint16_t fftSize,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab)
-{
-   uint16_t fftLenBy2, fftLenBy2p1;
-   uint16_t i, j;
-   float32_t in;
-
-   /*  Initializations */
-   j = 0U;
-   fftLenBy2 = fftSize >> 1U;
-   fftLenBy2p1 = (fftSize >> 1U) + 1U;
-
-   /* Bit Reversal Implementation */
-   for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
-   {
-      if (i < j)
-      {
-         /*  pSrc[i] <-> pSrc[j]; */
-         in = pSrc[2U * i];
-         pSrc[2U * i] = pSrc[2U * j];
-         pSrc[2U * j] = in;
-
-         /*  pSrc[i+1U] <-> pSrc[j+1U] */
-         in = pSrc[(2U * i) + 1U];
-         pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U];
-         pSrc[(2U * j) + 1U] = in;
-
-         /*  pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
-         in = pSrc[2U * (i + fftLenBy2p1)];
-         pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)];
-         pSrc[2U * (j + fftLenBy2p1)] = in;
-
-         /*  pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */
-         in = pSrc[(2U * (i + fftLenBy2p1)) + 1U];
-         pSrc[(2U * (i + fftLenBy2p1)) + 1U] =
-         pSrc[(2U * (j + fftLenBy2p1)) + 1U];
-         pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in;
-
-      }
-
-      /*  pSrc[i+1U] <-> pSrc[j+1U] */
-      in = pSrc[2U * (i + 1U)];
-      pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)];
-      pSrc[2U * (j + fftLenBy2)] = in;
-
-      /*  pSrc[i+2U] <-> pSrc[j+2U] */
-      in = pSrc[(2U * (i + 1U)) + 1U];
-      pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U];
-      pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
-
-      /*  Reading the index for the bit reversal */
-      j = *pBitRevTab;
-
-      /*  Updating the bit reversal index depending on the fft length  */
-      pBitRevTab += bitRevFactor;
-   }
-}
-
-
-/**
-  @brief         In-place Q31 bit reversal function.
-  @param[in,out] pSrc         points to in-place Q31 data buffer.
-  @param[in]     fftLen       length of FFT.
-  @param[in]     bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
-  @param[in]     pBitRevTab   points to bit reversal table
-  @return        none
-*/
-
-void arm_bitreversal_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab)
-{
-   uint32_t fftLenBy2, fftLenBy2p1, i, j;
-   q31_t in;
-
-   /*  Initializations      */
-   j = 0U;
-   fftLenBy2 = fftLen / 2U;
-   fftLenBy2p1 = (fftLen / 2U) + 1U;
-
-   /* Bit Reversal Implementation */
-   for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
-   {
-      if (i < j)
-      {
-         /*  pSrc[i] <-> pSrc[j]; */
-         in = pSrc[2U * i];
-         pSrc[2U * i] = pSrc[2U * j];
-         pSrc[2U * j] = in;
-
-         /*  pSrc[i+1U] <-> pSrc[j+1U] */
-         in = pSrc[(2U * i) + 1U];
-         pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U];
-         pSrc[(2U * j) + 1U] = in;
-
-         /*  pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
-         in = pSrc[2U * (i + fftLenBy2p1)];
-         pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)];
-         pSrc[2U * (j + fftLenBy2p1)] = in;
-
-         /*  pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */
-         in = pSrc[(2U * (i + fftLenBy2p1)) + 1U];
-         pSrc[(2U * (i + fftLenBy2p1)) + 1U] =
-         pSrc[(2U * (j + fftLenBy2p1)) + 1U];
-         pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in;
-
-      }
-
-      /*  pSrc[i+1U] <-> pSrc[j+1U] */
-      in = pSrc[2U * (i + 1U)];
-      pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)];
-      pSrc[2U * (j + fftLenBy2)] = in;
-
-      /*  pSrc[i+2U] <-> pSrc[j+2U] */
-      in = pSrc[(2U * (i + 1U)) + 1U];
-      pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U];
-      pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
-
-      /*  Reading the index for the bit reversal */
-      j = *pBitRevTab;
-
-      /*  Updating the bit reversal index depending on the fft length */
-      pBitRevTab += bitRevFactor;
-   }
-}
-
-
-
-/**
-  @brief         In-place Q15 bit reversal function.
-  @param[in,out] pSrc16       points to in-place Q15 data buffer
-  @param[in]     fftLen       length of FFT
-  @param[in]     bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
-  @param[in]     pBitRevTab   points to bit reversal table
-  @return        none
-*/
-
-void arm_bitreversal_q15(
-        q15_t * pSrc16,
-        uint32_t fftLen,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab)
-{
-   q31_t *pSrc = (q31_t *) pSrc16;
-   q31_t in;
-   uint32_t fftLenBy2, fftLenBy2p1;
-   uint32_t i, j;
-
-   /*  Initializations */
-   j = 0U;
-   fftLenBy2 = fftLen / 2U;
-   fftLenBy2p1 = (fftLen / 2U) + 1U;
-
-   /* Bit Reversal Implementation */
-   for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
-   {
-      if (i < j)
-      {
-         /*  pSrc[i] <-> pSrc[j]; */
-         /*  pSrc[i+1U] <-> pSrc[j+1U] */
-         in = pSrc[i];
-         pSrc[i] = pSrc[j];
-         pSrc[j] = in;
-
-         /*  pSrc[i + fftLenBy2p1] <-> pSrc[j + fftLenBy2p1];  */
-         /*  pSrc[i + fftLenBy2p1+1U] <-> pSrc[j + fftLenBy2p1+1U] */
-         in = pSrc[i + fftLenBy2p1];
-         pSrc[i + fftLenBy2p1] = pSrc[j + fftLenBy2p1];
-         pSrc[j + fftLenBy2p1] = in;
-      }
-
-      /*  pSrc[i+1U] <-> pSrc[j+fftLenBy2];         */
-      /*  pSrc[i+2] <-> pSrc[j+fftLenBy2+1U]  */
-      in = pSrc[i + 1U];
-      pSrc[i + 1U] = pSrc[j + fftLenBy2];
-      pSrc[j + fftLenBy2] = in;
-
-      /*  Reading the index for the bit reversal */
-      j = *pBitRevTab;
-
-      /*  Updating the bit reversal index depending on the fft length  */
-      pBitRevTab += bitRevFactor;
-   }
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bitreversal.c
+ * Description:  Bitreversal functions
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @brief         In-place floating-point bit reversal function.
+  @param[in,out] pSrc         points to in-place floating-point data buffer
+  @param[in]     fftSize      length of FFT
+  @param[in]     bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+  @param[in]     pBitRevTab   points to bit reversal table
+  @return        none
+ */
+
+void arm_bitreversal_f32(
+        float32_t * pSrc,
+        uint16_t fftSize,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab)
+{
+   uint16_t fftLenBy2, fftLenBy2p1;
+   uint16_t i, j;
+   float32_t in;
+
+   /*  Initializations */
+   j = 0U;
+   fftLenBy2 = fftSize >> 1U;
+   fftLenBy2p1 = (fftSize >> 1U) + 1U;
+
+   /* Bit Reversal Implementation */
+   for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
+   {
+      if (i < j)
+      {
+         /*  pSrc[i] <-> pSrc[j]; */
+         in = pSrc[2U * i];
+         pSrc[2U * i] = pSrc[2U * j];
+         pSrc[2U * j] = in;
+
+         /*  pSrc[i+1U] <-> pSrc[j+1U] */
+         in = pSrc[(2U * i) + 1U];
+         pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U];
+         pSrc[(2U * j) + 1U] = in;
+
+         /*  pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
+         in = pSrc[2U * (i + fftLenBy2p1)];
+         pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)];
+         pSrc[2U * (j + fftLenBy2p1)] = in;
+
+         /*  pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */
+         in = pSrc[(2U * (i + fftLenBy2p1)) + 1U];
+         pSrc[(2U * (i + fftLenBy2p1)) + 1U] =
+         pSrc[(2U * (j + fftLenBy2p1)) + 1U];
+         pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in;
+
+      }
+
+      /*  pSrc[i+1U] <-> pSrc[j+1U] */
+      in = pSrc[2U * (i + 1U)];
+      pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)];
+      pSrc[2U * (j + fftLenBy2)] = in;
+
+      /*  pSrc[i+2U] <-> pSrc[j+2U] */
+      in = pSrc[(2U * (i + 1U)) + 1U];
+      pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U];
+      pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
+
+      /*  Reading the index for the bit reversal */
+      j = *pBitRevTab;
+
+      /*  Updating the bit reversal index depending on the fft length  */
+      pBitRevTab += bitRevFactor;
+   }
+}
+
+
+/**
+  @brief         In-place Q31 bit reversal function.
+  @param[in,out] pSrc         points to in-place Q31 data buffer.
+  @param[in]     fftLen       length of FFT.
+  @param[in]     bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+  @param[in]     pBitRevTab   points to bit reversal table
+  @return        none
+*/
+
+void arm_bitreversal_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab)
+{
+   uint32_t fftLenBy2, fftLenBy2p1, i, j;
+   q31_t in;
+
+   /*  Initializations      */
+   j = 0U;
+   fftLenBy2 = fftLen / 2U;
+   fftLenBy2p1 = (fftLen / 2U) + 1U;
+
+   /* Bit Reversal Implementation */
+   for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
+   {
+      if (i < j)
+      {
+         /*  pSrc[i] <-> pSrc[j]; */
+         in = pSrc[2U * i];
+         pSrc[2U * i] = pSrc[2U * j];
+         pSrc[2U * j] = in;
+
+         /*  pSrc[i+1U] <-> pSrc[j+1U] */
+         in = pSrc[(2U * i) + 1U];
+         pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U];
+         pSrc[(2U * j) + 1U] = in;
+
+         /*  pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
+         in = pSrc[2U * (i + fftLenBy2p1)];
+         pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)];
+         pSrc[2U * (j + fftLenBy2p1)] = in;
+
+         /*  pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */
+         in = pSrc[(2U * (i + fftLenBy2p1)) + 1U];
+         pSrc[(2U * (i + fftLenBy2p1)) + 1U] =
+         pSrc[(2U * (j + fftLenBy2p1)) + 1U];
+         pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in;
+
+      }
+
+      /*  pSrc[i+1U] <-> pSrc[j+1U] */
+      in = pSrc[2U * (i + 1U)];
+      pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)];
+      pSrc[2U * (j + fftLenBy2)] = in;
+
+      /*  pSrc[i+2U] <-> pSrc[j+2U] */
+      in = pSrc[(2U * (i + 1U)) + 1U];
+      pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U];
+      pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
+
+      /*  Reading the index for the bit reversal */
+      j = *pBitRevTab;
+
+      /*  Updating the bit reversal index depending on the fft length */
+      pBitRevTab += bitRevFactor;
+   }
+}
+
+
+
+/**
+  @brief         In-place Q15 bit reversal function.
+  @param[in,out] pSrc16       points to in-place Q15 data buffer
+  @param[in]     fftLen       length of FFT
+  @param[in]     bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+  @param[in]     pBitRevTab   points to bit reversal table
+  @return        none
+*/
+
+void arm_bitreversal_q15(
+        q15_t * pSrc16,
+        uint32_t fftLen,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab)
+{
+   q31_t *pSrc = (q31_t *) pSrc16;
+   q31_t in;
+   uint32_t fftLenBy2, fftLenBy2p1;
+   uint32_t i, j;
+
+   /*  Initializations */
+   j = 0U;
+   fftLenBy2 = fftLen / 2U;
+   fftLenBy2p1 = (fftLen / 2U) + 1U;
+
+   /* Bit Reversal Implementation */
+   for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
+   {
+      if (i < j)
+      {
+         /*  pSrc[i] <-> pSrc[j]; */
+         /*  pSrc[i+1U] <-> pSrc[j+1U] */
+         in = pSrc[i];
+         pSrc[i] = pSrc[j];
+         pSrc[j] = in;
+
+         /*  pSrc[i + fftLenBy2p1] <-> pSrc[j + fftLenBy2p1];  */
+         /*  pSrc[i + fftLenBy2p1+1U] <-> pSrc[j + fftLenBy2p1+1U] */
+         in = pSrc[i + fftLenBy2p1];
+         pSrc[i + fftLenBy2p1] = pSrc[j + fftLenBy2p1];
+         pSrc[j + fftLenBy2p1] = in;
+      }
+
+      /*  pSrc[i+1U] <-> pSrc[j+fftLenBy2];         */
+      /*  pSrc[i+2] <-> pSrc[j+fftLenBy2+1U]  */
+      in = pSrc[i + 1U];
+      pSrc[i + 1U] = pSrc[j + fftLenBy2];
+      pSrc[j + fftLenBy2] = in;
+
+      /*  Reading the index for the bit reversal */
+      j = *pBitRevTab;
+
+      /*  Updating the bit reversal index depending on the fft length  */
+      pBitRevTab += bitRevFactor;
+   }
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S
index c16091b..01c1e76 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S
@@ -1,216 +1,216 @@
-;/* ----------------------------------------------------------------------
-; * Project:      CMSIS DSP Library
-; * Title:        arm_bitreversal2.S
-; * Description:  arm_bitreversal_32 function done in assembly for maximum speed.
-; *               Called after doing an fft to reorder the output.
-; *               The function is loop unrolled by 2. arm_bitreversal_16 as well.
-; *
-; * $Date:        18. March 2019
-; * $Revision:    V1.5.2
-; *
-; * Target Processor: Cortex-M cores
-; * -------------------------------------------------------------------- */
-;/*
-; * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
-; *
-; * SPDX-License-Identifier: Apache-2.0
-; *
-; * Licensed under the Apache License, Version 2.0 (the License); you may
-; * not use this file except in compliance with the License.
-; * You may obtain a copy of the License at
-; *
-; * www.apache.org/licenses/LICENSE-2.0
-; *
-; * Unless required by applicable law or agreed to in writing, software
-; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
-; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-; * See the License for the specific language governing permissions and
-; * limitations under the License.
-; */
-
-#if   defined ( __CC_ARM )     /* Keil */
-    #define CODESECT AREA     ||.text||, CODE, READONLY, ALIGN=2
-    #define LABEL
-#elif defined ( __IASMARM__ )  /* IAR */
-    #define CODESECT SECTION `.text`:CODE
-    #define PROC
-    #define LABEL
-    #define ENDP
-    #define EXPORT PUBLIC
-#elif defined ( __CSMC__ )	   /* Cosmic */
-	#define	CODESECT	switch .text
-	#define THUMB
-	#define EXPORT	xdef
-	#define PROC	:
-	#define LABEL	:
-	#define ENDP
-	#define arm_bitreversal_32 _arm_bitreversal_32
-#elif defined ( __TI_ARM__ )   /* TI ARM */
-    #define THUMB    .thumb
-    #define CODESECT .text
-    #define EXPORT   .global
-    #define PROC     : .asmfunc
-    #define LABEL    :
-    #define ENDP     .endasmfunc
-    #define END
-#elif defined ( __GNUC__ )     /* GCC */
-    #define THUMB .thumb
-    #define CODESECT .section .text
-    #define EXPORT .global
-    #define PROC :
-    #define LABEL :
-    #define ENDP
-    #define END
-
-    .syntax unified
-#endif
-
-	CODESECT
-	THUMB
-
-;/**
-;  @brief         In-place bit reversal function.
-;  @param[in,out] pSrc        points to the in-place buffer of unknown 32-bit data type
-;  @param[in]     bitRevLen   bit reversal table length
-;  @param[in]     pBitRevTab  points to bit reversal table
-;  @return        none
-; */
-	EXPORT arm_bitreversal_32
-	EXPORT arm_bitreversal_16
-
-#if   defined ( __CC_ARM )     /* Keil */
-#elif defined ( __IASMARM__ )  /* IAR */
-#elif defined ( __CSMC__ )	   /* Cosmic */
-#elif defined ( __TI_ARM__ )   /* TI ARM */
-#elif defined ( __GNUC__ )     /* GCC */
-	.type   arm_bitreversal_16, %function
-	.type   arm_bitreversal_32, %function
-#endif
-
-#if defined (ARM_MATH_CM0_FAMILY)
-
-arm_bitreversal_32 PROC
-	ADDS     r3,r1,#1
-	PUSH     {r4-r6}
-	ADDS     r1,r2,#0
-	LSRS     r3,r3,#1
-arm_bitreversal_32_0 LABEL
-	LDRH     r2,[r1,#2]
-	LDRH     r6,[r1,#0]
-	ADD      r2,r0,r2
-	ADD      r6,r0,r6
-	LDR      r5,[r2,#0]
-	LDR      r4,[r6,#0]
-	STR      r5,[r6,#0]
-	STR      r4,[r2,#0]
-	LDR      r5,[r2,#4]
-	LDR      r4,[r6,#4]
-	STR      r5,[r6,#4]
-	STR      r4,[r2,#4]
-	ADDS     r1,r1,#4
-	SUBS     r3,r3,#1
-	BNE      arm_bitreversal_32_0
-	POP      {r4-r6}
-	BX       lr
-	ENDP
-
-arm_bitreversal_16 PROC
-	ADDS     r3,r1,#1
-	PUSH     {r4-r6}
-	ADDS     r1,r2,#0
-	LSRS     r3,r3,#1
-arm_bitreversal_16_0 LABEL
-	LDRH     r2,[r1,#2]
-	LDRH     r6,[r1,#0]
-    LSRS     r2,r2,#1
-    LSRS     r6,r6,#1
-	ADD      r2,r0,r2
-	ADD      r6,r0,r6
-	LDR      r5,[r2,#0]
-	LDR      r4,[r6,#0]
-	STR      r5,[r6,#0]
-	STR      r4,[r2,#0]
-	ADDS     r1,r1,#4
-	SUBS     r3,r3,#1
-	BNE      arm_bitreversal_16_0
-	POP      {r4-r6}
-	BX       lr
-	ENDP
-
-#else
-
-arm_bitreversal_32 PROC
-	ADDS     r3,r1,#1
-	CMP      r3,#1
-	IT       LS
-	BXLS     lr
-	PUSH     {r4-r9}
-	ADDS     r1,r2,#2
-	LSRS     r3,r3,#2
-arm_bitreversal_32_0 LABEL       ;/* loop unrolled by 2 */
-	LDRH     r8,[r1,#4]
-	LDRH     r9,[r1,#2]
-	LDRH     r2,[r1,#0]
-	LDRH     r12,[r1,#-2]
-	ADD      r8,r0,r8
-	ADD      r9,r0,r9
-	ADD      r2,r0,r2
-	ADD      r12,r0,r12
-	LDR      r7,[r9,#0]
-	LDR      r6,[r8,#0]
-	LDR      r5,[r2,#0]
-	LDR      r4,[r12,#0]
-	STR      r6,[r9,#0]
-	STR      r7,[r8,#0]
-	STR      r5,[r12,#0]
-	STR      r4,[r2,#0]
-	LDR      r7,[r9,#4]
-	LDR      r6,[r8,#4]
-	LDR      r5,[r2,#4]
-	LDR      r4,[r12,#4]
-	STR      r6,[r9,#4]
-	STR      r7,[r8,#4]
-	STR      r5,[r12,#4]
-	STR      r4,[r2,#4]
-	ADDS     r1,r1,#8
-	SUBS     r3,r3,#1
-	BNE      arm_bitreversal_32_0
-	POP      {r4-r9}
-	BX       lr
-	ENDP
-
-arm_bitreversal_16 PROC
-	ADDS     r3,r1,#1
-	CMP      r3,#1
-	IT       LS
-	BXLS     lr
-	PUSH     {r4-r9}
-	ADDS     r1,r2,#2
-	LSRS     r3,r3,#2
-arm_bitreversal_16_0 LABEL       ;/* loop unrolled by 2 */
-	LDRH     r8,[r1,#4]
-	LDRH     r9,[r1,#2]
-	LDRH     r2,[r1,#0]
-	LDRH     r12,[r1,#-2]
-	ADD      r8,r0,r8,LSR #1
-	ADD      r9,r0,r9,LSR #1
-	ADD      r2,r0,r2,LSR #1
-	ADD      r12,r0,r12,LSR #1
-	LDR      r7,[r9,#0]
-	LDR      r6,[r8,#0]
-	LDR      r5,[r2,#0]
-	LDR      r4,[r12,#0]
-	STR      r6,[r9,#0]
-	STR      r7,[r8,#0]
-	STR      r5,[r12,#0]
-	STR      r4,[r2,#0]
-	ADDS     r1,r1,#8
-	SUBS     r3,r3,#1
-	BNE      arm_bitreversal_16_0
-	POP      {r4-r9}
-	BX       lr
-	ENDP
-
-#endif
-
-	END
+;/* ----------------------------------------------------------------------
+; * Project:      CMSIS DSP Library
+; * Title:        arm_bitreversal2.S
+; * Description:  arm_bitreversal_32 function done in assembly for maximum speed.
+; *               Called after doing an fft to reorder the output.
+; *               The function is loop unrolled by 2. arm_bitreversal_16 as well.
+; *
+; * $Date:        18. March 2019
+; * $Revision:    V1.5.2
+; *
+; * Target Processor: Cortex-M cores
+; * -------------------------------------------------------------------- */
+;/*
+; * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+#if   defined ( __CC_ARM )     /* Keil: only CODESECT and LABEL are remapped; other directive names below pass through unchanged */
+    #define CODESECT AREA     ||.text||, CODE, READONLY, ALIGN=2
+    #define LABEL
+#elif defined ( __IASMARM__ )  /* IAR: PROC, LABEL and ENDP expand to nothing; EXPORT becomes PUBLIC */
+    #define CODESECT SECTION `.text`:CODE
+    #define PROC
+    #define LABEL
+    #define ENDP
+    #define EXPORT PUBLIC
+#elif defined ( __CSMC__ )	   /* Cosmic: labels get a colon suffix; only arm_bitreversal_32 is renamed with a leading underscore */
+	#define	CODESECT	switch .text
+	#define THUMB
+	#define EXPORT	xdef
+	#define PROC	:
+	#define LABEL	:
+	#define ENDP
+	#define arm_bitreversal_32 _arm_bitreversal_32
+#elif defined ( __TI_ARM__ )   /* TI ARM: PROC and ENDP become .asmfunc and .endasmfunc; END expands to nothing */
+    #define THUMB    .thumb
+    #define CODESECT .text
+    #define EXPORT   .global
+    #define PROC     : .asmfunc
+    #define LABEL    :
+    #define ENDP     .endasmfunc
+    #define END
+#elif defined ( __GNUC__ )     /* GCC: PROC and LABEL become a colon after the name; ENDP and END expand to nothing */
+    #define THUMB .thumb
+    #define CODESECT .section .text
+    #define EXPORT .global
+    #define PROC :
+    #define LABEL :
+    #define ENDP
+    #define END
+
+    .syntax unified
+#endif
+
+	CODESECT
+	THUMB
+
+;/**
+;  @brief         In-place bit reversal functions exported below (arm_bitreversal_32 and arm_bitreversal_16).
+;  @param[in,out] pSrc        points to the in-place buffer (32-bit data for the _32 routine, 16-bit data for the _16 routine)
+;  @param[in]     bitRevLen   bit reversal table length
+;  @param[in]     pBitRevTab  points to bit reversal table
+;  @return        none
+; */
+	EXPORT arm_bitreversal_32
+	EXPORT arm_bitreversal_16
+
+#if   defined ( __CC_ARM )     /* Keil */
+#elif defined ( __IASMARM__ )  /* IAR */
+#elif defined ( __CSMC__ )	   /* Cosmic */
+#elif defined ( __TI_ARM__ )   /* TI ARM */
+#elif defined ( __GNUC__ )     /* GCC */
+	.type   arm_bitreversal_16, %function
+	.type   arm_bitreversal_32, %function
+#endif
+
+#if defined (ARM_MATH_CM0_FAMILY)
+
+arm_bitreversal_32 PROC          ;/* r0 = pSrc, r1 = bitRevLen, r2 = pBitRevTab (first three arguments) */
+	ADDS     r3,r1,#1             ;/* r3 = bitRevLen + 1 */
+	PUSH     {r4-r6}              ;/* r4-r6 are used as scratch below */
+	ADDS     r1,r2,#0             ;/* r1 = table read pointer (copy of r2) */
+	LSRS     r3,r3,#1             ;/* r3 = (bitRevLen + 1) / 2 = number of loop passes */
+arm_bitreversal_32_0 LABEL       ;/* one table pair per pass; no early exit, so a zero count wraps on the first SUBS */
+	LDRH     r2,[r1,#2]           ;/* r2 = second 16-bit table entry of the pair */
+	LDRH     r6,[r1,#0]           ;/* r6 = first 16-bit table entry of the pair */
+	ADD      r2,r0,r2             ;/* entries are byte offsets added to pSrc */
+	ADD      r6,r0,r6
+	LDR      r5,[r2,#0]           ;/* exchange the 32-bit words at the two addresses */
+	LDR      r4,[r6,#0]
+	STR      r5,[r6,#0]
+	STR      r4,[r2,#0]
+	LDR      r5,[r2,#4]           ;/* exchange the words that follow them (offset 4) */
+	LDR      r4,[r6,#4]
+	STR      r5,[r6,#4]
+	STR      r4,[r2,#4]
+	ADDS     r1,r1,#4             ;/* advance past the two table entries just used */
+	SUBS     r3,r3,#1             ;/* one pass done */
+	BNE      arm_bitreversal_32_0
+	POP      {r4-r6}
+	BX       lr
+	ENDP
+
+arm_bitreversal_16 PROC          ;/* r0 = pSrc, r1 = bitRevLen, r2 = pBitRevTab (first three arguments) */
+	ADDS     r3,r1,#1             ;/* r3 = bitRevLen + 1 */
+	PUSH     {r4-r6}              ;/* r4-r6 are used as scratch below */
+	ADDS     r1,r2,#0             ;/* r1 = table read pointer (copy of r2) */
+	LSRS     r3,r3,#1             ;/* r3 = (bitRevLen + 1) / 2 = number of loop passes */
+arm_bitreversal_16_0 LABEL       ;/* one table pair per pass; no early exit, so a zero count wraps on the first SUBS */
+	LDRH     r2,[r1,#2]           ;/* r2 = second 16-bit table entry of the pair */
+	LDRH     r6,[r1,#0]           ;/* r6 = first 16-bit table entry of the pair */
+    LSRS     r2,r2,#1             ;/* table values are halved before use as byte offsets */
+    LSRS     r6,r6,#1
+	ADD      r2,r0,r2             ;/* form the two addresses inside pSrc */
+	ADD      r6,r0,r6
+	LDR      r5,[r2,#0]           ;/* exchange one 32-bit word, i.e. two adjacent 16-bit values */
+	LDR      r4,[r6,#0]
+	STR      r5,[r6,#0]
+	STR      r4,[r2,#0]
+	ADDS     r1,r1,#4             ;/* advance past the two table entries just used */
+	SUBS     r3,r3,#1             ;/* one pass done */
+	BNE      arm_bitreversal_16_0
+	POP      {r4-r6}
+	BX       lr
+	ENDP
+
+#else
+
+arm_bitreversal_32 PROC          ;/* r0 = pSrc, r1 = bitRevLen, r2 = pBitRevTab (first three arguments) */
+	ADDS     r3,r1,#1             ;/* r3 = bitRevLen + 1 */
+	CMP      r3,#1
+	IT       LS
+	BXLS     lr                   ;/* return at once when r3 <= 1 (unsigned compare) */
+	PUSH     {r4-r9}              ;/* r4-r9 are used as scratch below */
+	ADDS     r1,r2,#2             ;/* r1 = table pointer + 2 bytes, so four entries sit at offsets -2, 0, +2, +4 */
+	LSRS     r3,r3,#2             ;/* r3 = (bitRevLen + 1) / 4 = number of loop passes */
+arm_bitreversal_32_0 LABEL       ;/* loop unrolled by 2: each pass consumes four table entries (two pairs) */
+	LDRH     r8,[r1,#4]           ;/* r8  = fourth entry of this pass */
+	LDRH     r9,[r1,#2]           ;/* r9  = third entry */
+	LDRH     r2,[r1,#0]           ;/* r2  = second entry */
+	LDRH     r12,[r1,#-2]         ;/* r12 = first entry */
+	ADD      r8,r0,r8             ;/* entries are byte offsets added to pSrc */
+	ADD      r9,r0,r9
+	ADD      r2,r0,r2
+	ADD      r12,r0,r12
+	LDR      r7,[r9,#0]           ;/* load the first word at all four addresses */
+	LDR      r6,[r8,#0]
+	LDR      r5,[r2,#0]
+	LDR      r4,[r12,#0]
+	STR      r6,[r9,#0]           ;/* exchange r9 <-> r8 locations */
+	STR      r7,[r8,#0]
+	STR      r5,[r12,#0]          ;/* exchange r12 <-> r2 locations */
+	STR      r4,[r2,#0]
+	LDR      r7,[r9,#4]           ;/* repeat for the word that follows at offset 4 */
+	LDR      r6,[r8,#4]
+	LDR      r5,[r2,#4]
+	LDR      r4,[r12,#4]
+	STR      r6,[r9,#4]
+	STR      r7,[r8,#4]
+	STR      r5,[r12,#4]
+	STR      r4,[r2,#4]
+	ADDS     r1,r1,#8             ;/* advance past the four table entries just used */
+	SUBS     r3,r3,#1             ;/* one pass done */
+	BNE      arm_bitreversal_32_0
+	POP      {r4-r9}
+	BX       lr
+	ENDP
+
+arm_bitreversal_16 PROC          ;/* r0 = pSrc, r1 = bitRevLen, r2 = pBitRevTab (first three arguments) */
+	ADDS     r3,r1,#1             ;/* r3 = bitRevLen + 1 */
+	CMP      r3,#1
+	IT       LS
+	BXLS     lr                   ;/* return at once when r3 <= 1 (unsigned compare) */
+	PUSH     {r4-r9}              ;/* r4-r9 are used as scratch below */
+	ADDS     r1,r2,#2             ;/* r1 = table pointer + 2 bytes, so four entries sit at offsets -2, 0, +2, +4 */
+	LSRS     r3,r3,#2             ;/* r3 = (bitRevLen + 1) / 4 = number of loop passes */
+arm_bitreversal_16_0 LABEL       ;/* loop unrolled by 2: each pass consumes four table entries (two pairs) */
+	LDRH     r8,[r1,#4]           ;/* r8  = fourth entry of this pass */
+	LDRH     r9,[r1,#2]           ;/* r9  = third entry */
+	LDRH     r2,[r1,#0]           ;/* r2  = second entry */
+	LDRH     r12,[r1,#-2]         ;/* r12 = first entry */
+	ADD      r8,r0,r8,LSR #1      ;/* address = pSrc + (entry >> 1): table values are halved here */
+	ADD      r9,r0,r9,LSR #1
+	ADD      r2,r0,r2,LSR #1
+	ADD      r12,r0,r12,LSR #1
+	LDR      r7,[r9,#0]           ;/* load one 32-bit word (two adjacent 16-bit values) per address */
+	LDR      r6,[r8,#0]
+	LDR      r5,[r2,#0]
+	LDR      r4,[r12,#0]
+	STR      r6,[r9,#0]           ;/* exchange r9 <-> r8 locations */
+	STR      r7,[r8,#0]
+	STR      r5,[r12,#0]          ;/* exchange r12 <-> r2 locations */
+	STR      r4,[r2,#0]
+	ADDS     r1,r1,#8             ;/* advance past the four table entries just used */
+	SUBS     r3,r3,#1             ;/* one pass done */
+	BNE      arm_bitreversal_16_0
+	POP      {r4-r9}
+	BX       lr
+	ENDP
+
+#endif
+
+	END
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c
index 77fac1f..29d5757 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c
@@ -1,134 +1,99 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_bitreversal2.c
- * Description:  Bitreversal functions
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-
-/**
-  @brief         In-place 64 bit reversal function.
-  @param[in,out] pSrc        points to in-place buffer of unknown 64-bit data type
-  @param[in]     bitRevLen   bit reversal table length
-  @param[in]     pBitRevTab  points to bit reversal table
-  @return        none
-*/
-
-void arm_bitreversal_64(
-        uint64_t *pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t *pBitRevTab)
-{
-  uint64_t a, b, i, tmp;
-
-  for (i = 0; i < bitRevLen; )
-  {
-     a = pBitRevTab[i    ] >> 2;
-     b = pBitRevTab[i + 1] >> 2;
-
-     //real
-     tmp = pSrc[a];
-     pSrc[a] = pSrc[b];
-     pSrc[b] = tmp;
-
-     //complex
-     tmp = pSrc[a+1];
-     pSrc[a+1] = pSrc[b+1];
-     pSrc[b+1] = tmp;
-
-    i += 2;
-  }
-}
-
-/**
-  @brief         In-place 32 bit reversal function.
-  @param[in,out] pSrc        points to in-place buffer of unknown 32-bit data type
-  @param[in]     bitRevLen   bit reversal table length
-  @param[in]     pBitRevTab  points to bit reversal table
-  @return        none
-*/
-
-void arm_bitreversal_32(
-        uint32_t *pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t *pBitRevTab)
-{
-  uint32_t a, b, i, tmp;
-
-  for (i = 0; i < bitRevLen; )
-  {
-     a = pBitRevTab[i    ] >> 2;
-     b = pBitRevTab[i + 1] >> 2;
-
-     //real
-     tmp = pSrc[a];
-     pSrc[a] = pSrc[b];
-     pSrc[b] = tmp;
-
-     //complex
-     tmp = pSrc[a+1];
-     pSrc[a+1] = pSrc[b+1];
-     pSrc[b+1] = tmp;
-
-    i += 2;
-  }
-}
-
-
-/**
-  @brief         In-place 16 bit reversal function.
-  @param[in,out] pSrc        points to in-place buffer of unknown 16-bit data type
-  @param[in]     bitRevLen   bit reversal table length
-  @param[in]     pBitRevTab  points to bit reversal table
-  @return        none
-*/
-
-void arm_bitreversal_16(
-        uint16_t *pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t *pBitRevTab)
-{
-  uint16_t a, b, i, tmp;
-
-  for (i = 0; i < bitRevLen; )
-  {
-     a = pBitRevTab[i    ] >> 2;
-     b = pBitRevTab[i + 1] >> 2;
-
-     //real
-     tmp = pSrc[a];
-     pSrc[a] = pSrc[b];
-     pSrc[b] = tmp;
-
-     //complex
-     tmp = pSrc[a+1];
-     pSrc[a+1] = pSrc[b+1];
-     pSrc[b+1] = tmp;
-
-    i += 2;
-  }
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bitreversal2.c
+ * Description:  Bitreversal functions
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @brief         In-place exchange of 32-bit element pairs selected by a bit reversal table.
+  @param[in,out] pSrc        points to the buffer of 32-bit words that is reordered in place
+  @param[in]     bitRevLen   number of table entries to consume (they are read two at a time)
+  @param[in]     pBitRevTab  points to the table; each entry shifted right by 2 indexes pSrc
+  @return        none
+*/
+
+void arm_bitreversal_32(
+        uint32_t *pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t *pBitRevTab)
+{
+  uint32_t idx;
+
+  for (idx = 0; idx < bitRevLen; idx += 2)
+  {
+    /* One table pair names the two positions to exchange. */
+    uint32_t *pFirst  = &pSrc[pBitRevTab[idx    ] >> 2];
+    uint32_t *pSecond = &pSrc[pBitRevTab[idx + 1] >> 2];
+    uint32_t  held;
+
+    /* first word at each position (real part) */
+    held       = pFirst[0];
+    pFirst[0]  = pSecond[0];
+    pSecond[0] = held;
+
+    /* following word at each position (imaginary part) */
+    held       = pFirst[1];
+    pFirst[1]  = pSecond[1];
+    pSecond[1] = held;
+  }
+}
+
+
+/**
+  @brief         In-place exchange of 16-bit element pairs selected by a bit reversal table.
+  @param[in,out] pSrc        points to the buffer of 16-bit values that is reordered in place
+  @param[in]     bitRevLen   number of table entries to consume (they are read two at a time)
+  @param[in]     pBitRevTab  points to the table; each entry shifted right by 2 indexes pSrc
+  @return        none
+*/
+
+void arm_bitreversal_16(
+        uint16_t *pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t *pBitRevTab)
+{
+  uint16_t idx;   /* kept 16 bits wide, like the table length it is compared with */
+
+  for (idx = 0; idx < bitRevLen; idx += 2)
+  {
+    /* One table pair names the two positions to exchange. */
+    uint16_t *pFirst  = &pSrc[pBitRevTab[idx    ] >> 2];
+    uint16_t *pSecond = &pSrc[pBitRevTab[idx + 1] >> 2];
+    uint16_t  held;
+
+    /* first value at each position (real part) */
+    held       = pFirst[0];
+    pFirst[0]  = pSecond[0];
+    pSecond[0] = held;
+
+    /* following value at each position (imaginary part) */
+    held       = pFirst[1];
+    pFirst[1]  = pSecond[1];
+    pSecond[1] = held;
+  }
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c
index 0b33e8e..15dbb8f 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f32.c
@@ -1,1192 +1,629 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_f32.c
- * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "arm_helium_utils.h"
-#include "arm_vec_fft.h"
-#include "arm_mve_tables.h"
-
-
-static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
-{
-  float32_t retValue=1.0;
-
-  switch (fftLen)
-  {
-
-  case 4096U:
-    retValue = 0.000244140625;
-    break;
-
-  case 2048U:
-    retValue = 0.00048828125;
-    break;
-
-  case 1024U:
-    retValue = 0.0009765625f;
-    break;
-
-  case 512U:
-    retValue = 0.001953125;
-    break;
-
-  case 256U:
-    retValue = 0.00390625f;
-    break;
-
-  case 128U:
-    retValue = 0.0078125;
-    break;
-
-  case 64U:
-    retValue = 0.015625f;
-    break;
-
-  case 32U:
-    retValue = 0.03125;
-    break;
-
-  case 16U:
-    retValue = 0.0625f;
-    break;
-
-
-  default:
-    break;
-  }
-  return(retValue);
-}
-
-
-
-
-static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
-{
-    f32x4_t     vecTmp0, vecTmp1;
-    f32x4_t     vecSum0, vecDiff0, vecSum1, vecDiff1;
-    f32x4_t     vecA, vecB, vecC, vecD;
-    uint32_t    blkCnt;
-    uint32_t    n1, n2;
-    uint32_t    stage = 0;
-    int32_t     iter = 1;
-    static const int32_t strides[4] = {
-        (0 - 16) * (int32_t)sizeof(q31_t *),
-        (1 - 16) * (int32_t)sizeof(q31_t *),
-        (8 - 16) * (int32_t)sizeof(q31_t *),
-        (9 - 16) * (int32_t)sizeof(q31_t *)
-    };
-
-    n2 = fftLen;
-    n1 = n2;
-    n2 >>= 2u;
-    for (int k = fftLen / 4u; k > 1; k >>= 2)
-    {
-        float32_t const     *p_rearranged_twiddle_tab_stride1 =
-                            &S->rearranged_twiddle_stride1[
-                            S->rearranged_twiddle_tab_stride1_arr[stage]];
-        float32_t const     *p_rearranged_twiddle_tab_stride2 =
-                            &S->rearranged_twiddle_stride2[
-                            S->rearranged_twiddle_tab_stride2_arr[stage]];
-        float32_t const     *p_rearranged_twiddle_tab_stride3 =
-                            &S->rearranged_twiddle_stride3[
-                            S->rearranged_twiddle_tab_stride3_arr[stage]];
-
-        float32_t * pBase = pSrc;
-        for (int i = 0; i < iter; i++)
-        {
-            float32_t    *inA = pBase;
-            float32_t    *inB = inA + n2 * CMPLX_DIM;
-            float32_t    *inC = inB + n2 * CMPLX_DIM;
-            float32_t    *inD = inC + n2 * CMPLX_DIM;
-            float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
-            float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
-            float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
-            f32x4_t            vecW;
-
-            blkCnt = n2 / 2;
-            /*
-             * load 2 f32 complex pair
-             */
-            vecA = vldrwq_f32(inA);
-            vecC = vldrwq_f32(inC);
-            while (blkCnt > 0U)
-            {
-                vecB = vldrwq_f32(inB);
-                vecD = vldrwq_f32(inD);
-
-                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
-                vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
-
-                vecSum1 = vecB + vecD;
-                vecDiff1 = vecB - vecD;
-                /*
-                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
-                 */
-                vecTmp0 = vecSum0 + vecSum1;
-                vst1q(inA, vecTmp0);
-                inA += 4;
-
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'
-                 */
-                vecTmp0 = vecSum0 - vecSum1;
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
-                 */
-                vecW = vld1q(pW2);
-                pW2 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
-                vst1q(inB, vecTmp1);
-                inB += 4;
-
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
-                 */
-                vecW = vld1q(pW1);
-                pW1 +=4;
-                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
-                vst1q(inC, vecTmp1);
-                inC += 4;
-
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
-                 */
-                vecW = vld1q(pW3);
-                pW3 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
-                vst1q(inD, vecTmp1);
-                inD += 4;
-
-                vecA = vldrwq_f32(inA);
-                vecC = vldrwq_f32(inC);
-
-                blkCnt--;
-            }
-            pBase +=  CMPLX_DIM * n1;
-        }
-        n1 = n2;
-        n2 >>= 2u;
-        iter = iter << 2;
-        stage++;
-    }
-
-    /*
-     * start of Last stage process
-     */
-    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
-    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
-
-    /* load scheduling */
-    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
-    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
-
-    blkCnt = (fftLen >> 3);
-    while (blkCnt > 0U)
-    {
-        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
-        vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
-
-        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
-        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);
-
-        vecSum1 = vecB + vecD;
-        vecDiff1 = vecB - vecD;
-
-        /* pre-load for next iteration */
-        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
-        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
-
-        vecTmp0 = vecSum0 + vecSum1;
-        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);
-
-        vecTmp0 = vecSum0 - vecSum1;
-        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);
-
-        blkCnt--;
-    }
-
-    /*
-     * End of last stage process
-     */
-}
-
-static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
-{
-    float32_t const *pCoefVec;
-    float32_t const  *pCoef = S->pTwiddle;
-    float32_t        *pIn0, *pIn1;
-    uint32_t          n2;
-    uint32_t          blkCnt;
-    f32x4_t         vecIn0, vecIn1, vecSum, vecDiff;
-    f32x4_t         vecCmplxTmp, vecTw;
-
-
-    n2 = fftLen >> 1;
-    pIn0 = pSrc;
-    pIn1 = pSrc + fftLen;
-    pCoefVec = pCoef;
-
-    blkCnt = n2 / 2;
-    while (blkCnt > 0U)
-    {
-        vecIn0 = *(f32x4_t *) pIn0;
-        vecIn1 = *(f32x4_t *) pIn1;
-        vecTw = vld1q(pCoefVec);
-        pCoefVec += 4;
-
-        vecSum = vecIn0 + vecIn1;
-        vecDiff = vecIn0 - vecIn1;
-
-        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
-
-        vst1q(pIn0, vecSum);
-        pIn0 += 4;
-        vst1q(pIn1, vecCmplxTmp);
-        pIn1 += 4;
-
-        blkCnt--;
-    }
-
-    _arm_radix4_butterfly_f32_mve(S, pSrc, n2);
-
-    _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
-
-    pIn0 = pSrc;
-}
-
-static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
-{
-    f32x4_t vecTmp0, vecTmp1;
-    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
-    f32x4_t vecA, vecB, vecC, vecD;
-    uint32_t  blkCnt;
-    uint32_t  n1, n2;
-    uint32_t  stage = 0;
-    int32_t  iter = 1;
-    static const int32_t strides[4] = {
-        (0 - 16) * (int32_t)sizeof(q31_t *),
-        (1 - 16) * (int32_t)sizeof(q31_t *),
-        (8 - 16) * (int32_t)sizeof(q31_t *),
-        (9 - 16) * (int32_t)sizeof(q31_t *)
-    };
-
-    n2 = fftLen;
-    n1 = n2;
-    n2 >>= 2u;
-    for (int k = fftLen / 4; k > 1; k >>= 2)
-    {
-        float32_t const *p_rearranged_twiddle_tab_stride1 =
-                &S->rearranged_twiddle_stride1[
-                S->rearranged_twiddle_tab_stride1_arr[stage]];
-        float32_t const *p_rearranged_twiddle_tab_stride2 =
-                &S->rearranged_twiddle_stride2[
-                S->rearranged_twiddle_tab_stride2_arr[stage]];
-        float32_t const *p_rearranged_twiddle_tab_stride3 =
-                &S->rearranged_twiddle_stride3[
-                S->rearranged_twiddle_tab_stride3_arr[stage]];
-
-        float32_t * pBase = pSrc;
-        for (int i = 0; i < iter; i++)
-        {
-            float32_t    *inA = pBase;
-            float32_t    *inB = inA + n2 * CMPLX_DIM;
-            float32_t    *inC = inB + n2 * CMPLX_DIM;
-            float32_t    *inD = inC + n2 * CMPLX_DIM;
-            float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
-            float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
-            float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
-            f32x4_t       vecW;
-
-            blkCnt = n2 / 2;
-            /*
-             * load 2 f32 complex pair
-             */
-            vecA = vldrwq_f32(inA);
-            vecC = vldrwq_f32(inC);
-            while (blkCnt > 0U)
-            {
-                vecB = vldrwq_f32(inB);
-                vecD = vldrwq_f32(inD);
-
-                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
-                vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
-
-                vecSum1 = vecB + vecD;
-                vecDiff1 = vecB - vecD;
-                /*
-                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
-                 */
-                vecTmp0 = vecSum0 + vecSum1;
-                vst1q(inA, vecTmp0);
-                inA += 4;
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'
-                 */
-                vecTmp0 = vecSum0 - vecSum1;
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
-                 */
-                vecW = vld1q(pW2);
-                pW2 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
-                vst1q(inB, vecTmp1);
-                inB += 4;
-
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W2
-                 */
-                vecW = vld1q(pW1);
-                pW1 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
-                vst1q(inC, vecTmp1);
-                inC += 4;
-
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
-                 */
-                vecW = vld1q(pW3);
-                pW3 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
-                vst1q(inD, vecTmp1);
-                inD += 4;
-
-                vecA = vldrwq_f32(inA);
-                vecC = vldrwq_f32(inC);
-
-                blkCnt--;
-            }
-            pBase +=  CMPLX_DIM * n1;
-        }
-        n1 = n2;
-        n2 >>= 2u;
-        iter = iter << 2;
-        stage++;
-    }
-
-    /*
-     * start of Last stage process
-     */
-    uint32x4_t vecScGathAddr = vld1q_u32 ((uint32_t*)strides);
-    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
-
-    /*
-     * load scheduling
-     */
-    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
-    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
-
-    blkCnt = (fftLen >> 3);
-    while (blkCnt > 0U)
-    {
-        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
-        vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
-
-        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
-        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);
-
-        vecSum1 = vecB + vecD;
-        vecDiff1 = vecB - vecD;
-
-        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
-        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
-
-        vecTmp0 = vecSum0 + vecSum1;
-        vecTmp0 = vecTmp0 * onebyfftLen;
-        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);
-
-        vecTmp0 = vecSum0 - vecSum1;
-        vecTmp0 = vecTmp0 * onebyfftLen;
-        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
-        vecTmp0 = vecTmp0 * onebyfftLen;
-        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
-        vecTmp0 = vecTmp0 * onebyfftLen;
-        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);
-
-        blkCnt--;
-    }
-
-    /*
-     * End of last stage process
-     */
-}
-
-static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
-{
-    float32_t const *pCoefVec;
-    float32_t const  *pCoef = S->pTwiddle;
-    float32_t        *pIn0, *pIn1;
-    uint32_t          n2;
-    float32_t         onebyfftLen = arm_inverse_fft_length_f32(fftLen);
-    uint32_t          blkCnt;
-    f32x4_t         vecIn0, vecIn1, vecSum, vecDiff;
-    f32x4_t         vecCmplxTmp, vecTw;
-
-
-    n2 = fftLen >> 1;
-    pIn0 = pSrc;
-    pIn1 = pSrc + fftLen;
-    pCoefVec = pCoef;
-
-    blkCnt = n2 / 2;
-    while (blkCnt > 0U)
-    {
-        vecIn0 = *(f32x4_t *) pIn0;
-        vecIn1 = *(f32x4_t *) pIn1;
-        vecTw = vld1q(pCoefVec);
-        pCoefVec += 4;
-
-        vecSum = vecIn0 + vecIn1;
-        vecDiff = vecIn0 - vecIn1;
-
-        vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
-
-        vst1q(pIn0, vecSum);
-        pIn0 += 4;
-        vst1q(pIn1, vecCmplxTmp);
-        pIn1 += 4;
-
-        blkCnt--;
-    }
-
-    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen);
-
-    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen);
-}
-
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the floating-point complex FFT.
-  @param[in]     S              points to an instance of the floating-point CFFT structure
-  @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        none
- */
-
-
-void arm_cfft_f32(
-  const arm_cfft_instance_f32 * S,
-        float32_t * pSrc,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag)
-{
-        uint32_t fftLen = S->fftLen;
-
-        if (ifftFlag == 1U) {
-
-            switch (fftLen) {
-            case 16:
-            case 64:
-            case 256:
-            case 1024:
-            case 4096:
-                _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
-                break;
-
-            case 32:
-            case 128:
-            case 512:
-            case 2048:
-                arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
-                break;
-            }
-        } else {
-            switch (fftLen) {
-            case 16:
-            case 64:
-            case 256:
-            case 1024:
-            case 4096:
-                _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
-                break;
-
-            case 32:
-            case 128:
-            case 512:
-            case 2048:
-                arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
-                break;
-            }
-        }
-
-
-        if (bitReverseFlag)
-        {
-
-            arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-
-        }
-}
-
-
-#else
-extern void arm_radix8_butterfly_f32(
-        float32_t * pSrc,
-        uint16_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier);
-
-extern void arm_bitreversal_32(
-        uint32_t * pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t * pBitRevTable);
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @defgroup ComplexFFT Complex FFT Functions
-
-  @par
-                   The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
-                   Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
-                   than the DFT, especially for long lengths.
-                   The algorithms described in this section
-                   operate on complex data.  A separate set of functions is devoted to handling
-                   of real sequences.
-  @par
-                   There are separate algorithms for handling floating-point, Q15, and Q31 data
-                   types.  The algorithms available for each data type are described next.
-  @par
-                   The FFT functions operate in-place.  That is, the array holding the input data
-                   will also be used to hold the corresponding result.  The input data is complex
-                   and contains <code>2*fftLen</code> interleaved values as shown below.
-                   <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
-                   The FFT result will be contained in the same array and the frequency domain
-                   values will have the same interleaving.
-
-  @par Floating-point
-                   The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
-                   stages are performed along with a single radix-2 or radix-4 stage, as needed.
-                   The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
-                   a different twiddle factor table.
-  @par
-                   The function uses the standard FFT definition and output values may grow by a
-                   factor of <code>fftLen</code> when computing the forward transform.  The
-                   inverse transform includes a scale of <code>1/fftLen</code> as part of the
-                   calculation and this matches the textbook definition of the inverse FFT.
-  @par
-                   For the MVE version, the new arm_cfft_init_f32 initialization function is
-                   <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
-                   needed FFTs.</b> Other FFT versions can continue to be initialized as
-                   explained below.
-  @par
-                   For not MVE versions, pre-initialized data structures containing twiddle factors
-                   and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
-                   this header in your function and then pass one of the constant structures as
-                   an argument to arm_cfft_f32.  For example:
-  @par
-                   <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
-  @par
-                   computes a 64-point inverse complex FFT including bit reversal.
-                   The data structures are treated as constant data and not modified during the
-                   calculation.  The same data structure can be reused for multiple transforms
-                   including mixing forward and inverse transforms.
-  @par
-                   Earlier releases of the library provided separate radix-2 and radix-4
-                   algorithms that operated on floating-point data.  These functions are still
-                   provided but are deprecated.  The older functions are slower and less general
-                   than the new functions.
-  @par
-                   An example of initialization of the constants for the arm_cfft_f32 function follows:
-  @code
-                   const static arm_cfft_instance_f32 *S;
-                   ...
-                     switch (length) {
-                       case 16:
-                         S = &arm_cfft_sR_f32_len16;
-                         break;
-                       case 32:
-                         S = &arm_cfft_sR_f32_len32;
-                         break;
-                       case 64:
-                         S = &arm_cfft_sR_f32_len64;
-                         break;
-                       case 128:
-                         S = &arm_cfft_sR_f32_len128;
-                         break;
-                       case 256:
-                         S = &arm_cfft_sR_f32_len256;
-                         break;
-                       case 512:
-                         S = &arm_cfft_sR_f32_len512;
-                         break;
-                       case 1024:
-                         S = &arm_cfft_sR_f32_len1024;
-                         break;
-                       case 2048:
-                         S = &arm_cfft_sR_f32_len2048;
-                         break;
-                       case 4096:
-                         S = &arm_cfft_sR_f32_len4096;
-                         break;
-                     }
-  @endcode
-  @par
-                   The new arm_cfft_init_f32 can also be used.
-  @par Q15 and Q31
-                   The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
-                   stages are performed along with a single radix-2 stage, as needed.
-                   The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
-                   a different twiddle factor table.
-  @par
-                   The function uses the standard FFT definition and output values may grow by a
-                   factor of <code>fftLen</code> when computing the forward transform.  The
-                   inverse transform includes a scale of <code>1/fftLen</code> as part of the
-                   calculation and this matches the textbook definition of the inverse FFT.
-  @par
-                   Pre-initialized data structures containing twiddle factors and bit reversal
-                   tables are provided and defined in <code>arm_const_structs.h</code>.  Include
-                   this header in your function and then pass one of the constant structures as
-                   an argument to arm_cfft_q31. For example:
-  @par
-                   <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
-  @par
-                   computes a 64-point inverse complex FFT including bit reversal.
-                   The data structures are treated as constant data and not modified during the
-                   calculation.  The same data structure can be reused for multiple transforms
-                   including mixing forward and inverse transforms.
-  @par
-                   Earlier releases of the library provided separate radix-2 and radix-4
-                   algorithms that operated on floating-point data.  These functions are still
-                   provided but are deprecated.  The older functions are slower and less general
-                   than the new functions.
-  @par
-                   An example of initialization of the constants for the arm_cfft_q31 function follows:
-  @code
-                   const static arm_cfft_instance_q31 *S;
-                   ...
-                     switch (length) {
-                       case 16:
-                         S = &arm_cfft_sR_q31_len16;
-                         break;
-                       case 32:
-                         S = &arm_cfft_sR_q31_len32;
-                         break;
-                       case 64:
-                         S = &arm_cfft_sR_q31_len64;
-                         break;
-                       case 128:
-                         S = &arm_cfft_sR_q31_len128;
-                         break;
-                       case 256:
-                         S = &arm_cfft_sR_q31_len256;
-                         break;
-                       case 512:
-                         S = &arm_cfft_sR_q31_len512;
-                         break;
-                       case 1024:
-                         S = &arm_cfft_sR_q31_len1024;
-                         break;
-                       case 2048:
-                         S = &arm_cfft_sR_q31_len2048;
-                         break;
-                       case 4096:
-                         S = &arm_cfft_sR_q31_len4096;
-                         break;
-                     }
-  @endcode
-
- */
-
-void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
-{
-  uint32_t    L  = S->fftLen;
-  float32_t * pCol1, * pCol2, * pMid1, * pMid2;
-  float32_t * p2 = p1 + L;
-  const float32_t * tw = (float32_t *) S->pTwiddle;
-  float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
-  float32_t m0, m1, m2, m3;
-  uint32_t l;
-
-  pCol1 = p1;
-  pCol2 = p2;
-
-  /* Define new length */
-  L >>= 1;
-
-  /* Initialize mid pointers */
-  pMid1 = p1 + L;
-  pMid2 = p2 + L;
-
-  /* do two dot Fourier transform */
-  for (l = L >> 2; l > 0; l-- )
-  {
-    t1[0] = p1[0];
-    t1[1] = p1[1];
-    t1[2] = p1[2];
-    t1[3] = p1[3];
-
-    t2[0] = p2[0];
-    t2[1] = p2[1];
-    t2[2] = p2[2];
-    t2[3] = p2[3];
-
-    t3[0] = pMid1[0];
-    t3[1] = pMid1[1];
-    t3[2] = pMid1[2];
-    t3[3] = pMid1[3];
-
-    t4[0] = pMid2[0];
-    t4[1] = pMid2[1];
-    t4[2] = pMid2[2];
-    t4[3] = pMid2[3];
-
-    *p1++ = t1[0] + t2[0];
-    *p1++ = t1[1] + t2[1];
-    *p1++ = t1[2] + t2[2];
-    *p1++ = t1[3] + t2[3];    /* col 1 */
-
-    t2[0] = t1[0] - t2[0];
-    t2[1] = t1[1] - t2[1];
-    t2[2] = t1[2] - t2[2];
-    t2[3] = t1[3] - t2[3];    /* for col 2 */
-
-    *pMid1++ = t3[0] + t4[0];
-    *pMid1++ = t3[1] + t4[1];
-    *pMid1++ = t3[2] + t4[2];
-    *pMid1++ = t3[3] + t4[3]; /* col 1 */
-
-    t4[0] = t4[0] - t3[0];
-    t4[1] = t4[1] - t3[1];
-    t4[2] = t4[2] - t3[2];
-    t4[3] = t4[3] - t3[3];    /* for col 2 */
-
-    twR = *tw++;
-    twI = *tw++;
-
-    /* multiply by twiddle factors */
-    m0 = t2[0] * twR;
-    m1 = t2[1] * twI;
-    m2 = t2[1] * twR;
-    m3 = t2[0] * twI;
-
-    /* R  =  R  *  Tr - I * Ti */
-    *p2++ = m0 + m1;
-    /* I  =  I  *  Tr + R * Ti */
-    *p2++ = m2 - m3;
-
-    /* use vertical symmetry */
-    /*  0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
-    m0 = t4[0] * twI;
-    m1 = t4[1] * twR;
-    m2 = t4[1] * twI;
-    m3 = t4[0] * twR;
-
-    *pMid2++ = m0 - m1;
-    *pMid2++ = m2 + m3;
-
-    twR = *tw++;
-    twI = *tw++;
-
-    m0 = t2[2] * twR;
-    m1 = t2[3] * twI;
-    m2 = t2[3] * twR;
-    m3 = t2[2] * twI;
-
-    *p2++ = m0 + m1;
-    *p2++ = m2 - m3;
-
-    m0 = t4[2] * twI;
-    m1 = t4[3] * twR;
-    m2 = t4[3] * twI;
-    m3 = t4[2] * twR;
-
-    *pMid2++ = m0 - m1;
-    *pMid2++ = m2 + m3;
-  }
-
-  /* first col */
-  arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);
-
-  /* second col */
-  arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
-}
-
-void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
-{
-    uint32_t    L  = S->fftLen >> 1;
-    float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
-    const float32_t *tw2, *tw3, *tw4;
-    float32_t * p2 = p1 + L;
-    float32_t * p3 = p2 + L;
-    float32_t * p4 = p3 + L;
-    float32_t t2[4], t3[4], t4[4], twR, twI;
-    float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
-    float32_t m0, m1, m2, m3;
-    uint32_t l, twMod2, twMod3, twMod4;
-
-    pCol1 = p1;         /* points to real values by default */
-    pCol2 = p2;
-    pCol3 = p3;
-    pCol4 = p4;
-    pEnd1 = p2 - 1;     /* points to imaginary values by default */
-    pEnd2 = p3 - 1;
-    pEnd3 = p4 - 1;
-    pEnd4 = pEnd3 + L;
-
-    tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;
-
-    L >>= 1;
-
-    /* do four dot Fourier transform */
-
-    twMod2 = 2;
-    twMod3 = 4;
-    twMod4 = 6;
-
-    /* TOP */
-    p1ap3_0 = p1[0] + p3[0];
-    p1sp3_0 = p1[0] - p3[0];
-    p1ap3_1 = p1[1] + p3[1];
-    p1sp3_1 = p1[1] - p3[1];
-
-    /* col 2 */
-    t2[0] = p1sp3_0 + p2[1] - p4[1];
-    t2[1] = p1sp3_1 - p2[0] + p4[0];
-    /* col 3 */
-    t3[0] = p1ap3_0 - p2[0] - p4[0];
-    t3[1] = p1ap3_1 - p2[1] - p4[1];
-    /* col 4 */
-    t4[0] = p1sp3_0 - p2[1] + p4[1];
-    t4[1] = p1sp3_1 + p2[0] - p4[0];
-    /* col 1 */
-    *p1++ = p1ap3_0 + p2[0] + p4[0];
-    *p1++ = p1ap3_1 + p2[1] + p4[1];
-
-    /* Twiddle factors are ones */
-    *p2++ = t2[0];
-    *p2++ = t2[1];
-    *p3++ = t3[0];
-    *p3++ = t3[1];
-    *p4++ = t4[0];
-    *p4++ = t4[1];
-
-    tw2 += twMod2;
-    tw3 += twMod3;
-    tw4 += twMod4;
-
-    for (l = (L - 2) >> 1; l > 0; l-- )
-    {
-      /* TOP */
-      p1ap3_0 = p1[0] + p3[0];
-      p1sp3_0 = p1[0] - p3[0];
-      p1ap3_1 = p1[1] + p3[1];
-      p1sp3_1 = p1[1] - p3[1];
-      /* col 2 */
-      t2[0] = p1sp3_0 + p2[1] - p4[1];
-      t2[1] = p1sp3_1 - p2[0] + p4[0];
-      /* col 3 */
-      t3[0] = p1ap3_0 - p2[0] - p4[0];
-      t3[1] = p1ap3_1 - p2[1] - p4[1];
-      /* col 4 */
-      t4[0] = p1sp3_0 - p2[1] + p4[1];
-      t4[1] = p1sp3_1 + p2[0] - p4[0];
-      /* col 1 - top */
-      *p1++ = p1ap3_0 + p2[0] + p4[0];
-      *p1++ = p1ap3_1 + p2[1] + p4[1];
-
-      /* BOTTOM */
-      p1ap3_1 = pEnd1[-1] + pEnd3[-1];
-      p1sp3_1 = pEnd1[-1] - pEnd3[-1];
-      p1ap3_0 = pEnd1[ 0] + pEnd3[0];
-      p1sp3_0 = pEnd1[ 0] - pEnd3[0];
-      /* col 2 */
-      t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
-      t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
-      /* col 3 */
-      t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
-      t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
-      /* col 4 */
-      t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
-      t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
-      /* col 1 - Bottom */
-      *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
-      *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];
-
-      /* COL 2 */
-      /* read twiddle factors */
-      twR = *tw2++;
-      twI = *tw2++;
-      /* multiply by twiddle factors */
-      /*  let    Z1 = a + i(b),   Z2 = c + i(d) */
-      /*   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d) */
-
-      /* Top */
-      m0 = t2[0] * twR;
-      m1 = t2[1] * twI;
-      m2 = t2[1] * twR;
-      m3 = t2[0] * twI;
-
-      *p2++ = m0 + m1;
-      *p2++ = m2 - m3;
-      /* use vertical symmetry col 2 */
-      /* 0.9997 - 0.0245i  <==>  0.0245 - 0.9997i */
-      /* Bottom */
-      m0 = t2[3] * twI;
-      m1 = t2[2] * twR;
-      m2 = t2[2] * twI;
-      m3 = t2[3] * twR;
-
-      *pEnd2-- = m0 - m1;
-      *pEnd2-- = m2 + m3;
-
-      /* COL 3 */
-      twR = tw3[0];
-      twI = tw3[1];
-      tw3 += twMod3;
-      /* Top */
-      m0 = t3[0] * twR;
-      m1 = t3[1] * twI;
-      m2 = t3[1] * twR;
-      m3 = t3[0] * twI;
-
-      *p3++ = m0 + m1;
-      *p3++ = m2 - m3;
-      /* use vertical symmetry col 3 */
-      /* 0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i */
-      /* Bottom */
-      m0 = -t3[3] * twR;
-      m1 =  t3[2] * twI;
-      m2 =  t3[2] * twR;
-      m3 =  t3[3] * twI;
-
-      *pEnd3-- = m0 - m1;
-      *pEnd3-- = m3 - m2;
-
-      /* COL 4 */
-      twR = tw4[0];
-      twI = tw4[1];
-      tw4 += twMod4;
-      /* Top */
-      m0 = t4[0] * twR;
-      m1 = t4[1] * twI;
-      m2 = t4[1] * twR;
-      m3 = t4[0] * twI;
-
-      *p4++ = m0 + m1;
-      *p4++ = m2 - m3;
-      /* use vertical symmetry col 4 */
-      /* 0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i */
-      /* Bottom */
-      m0 = t4[3] * twI;
-      m1 = t4[2] * twR;
-      m2 = t4[2] * twI;
-      m3 = t4[3] * twR;
-
-      *pEnd4-- = m0 - m1;
-      *pEnd4-- = m2 + m3;
-    }
-
-    /* MIDDLE */
-    /* Twiddle factors are */
-    /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
-    p1ap3_0 = p1[0] + p3[0];
-    p1sp3_0 = p1[0] - p3[0];
-    p1ap3_1 = p1[1] + p3[1];
-    p1sp3_1 = p1[1] - p3[1];
-
-    /* col 2 */
-    t2[0] = p1sp3_0 + p2[1] - p4[1];
-    t2[1] = p1sp3_1 - p2[0] + p4[0];
-    /* col 3 */
-    t3[0] = p1ap3_0 - p2[0] - p4[0];
-    t3[1] = p1ap3_1 - p2[1] - p4[1];
-    /* col 4 */
-    t4[0] = p1sp3_0 - p2[1] + p4[1];
-    t4[1] = p1sp3_1 + p2[0] - p4[0];
-    /* col 1 - Top */
-    *p1++ = p1ap3_0 + p2[0] + p4[0];
-    *p1++ = p1ap3_1 + p2[1] + p4[1];
-
-    /* COL 2 */
-    twR = tw2[0];
-    twI = tw2[1];
-
-    m0 = t2[0] * twR;
-    m1 = t2[1] * twI;
-    m2 = t2[1] * twR;
-    m3 = t2[0] * twI;
-
-    *p2++ = m0 + m1;
-    *p2++ = m2 - m3;
-    /* COL 3 */
-    twR = tw3[0];
-    twI = tw3[1];
-
-    m0 = t3[0] * twR;
-    m1 = t3[1] * twI;
-    m2 = t3[1] * twR;
-    m3 = t3[0] * twI;
-
-    *p3++ = m0 + m1;
-    *p3++ = m2 - m3;
-    /* COL 4 */
-    twR = tw4[0];
-    twI = tw4[1];
-
-    m0 = t4[0] * twR;
-    m1 = t4[1] * twI;
-    m2 = t4[1] * twR;
-    m3 = t4[0] * twI;
-
-    *p4++ = m0 + m1;
-    *p4++ = m2 - m3;
-
-    /* first col */
-    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);
-
-    /* second col */
-    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);
-
-    /* third col */
-    arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);
-
-    /* fourth col */
-    arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
-}
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the floating-point complex FFT.
-  @param[in]     S              points to an instance of the floating-point CFFT structure
-  @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        none
- */
-
-void arm_cfft_f32(
-  const arm_cfft_instance_f32 * S,
-        float32_t * p1,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag)
-{
-  uint32_t  L = S->fftLen, l;
-  float32_t invL, * pSrc;
-
-  if (ifftFlag == 1U)
-  {
-    /* Conjugate input data */
-    pSrc = p1 + 1;
-    for (l = 0; l < L; l++)
-    {
-      *pSrc = -*pSrc;
-      pSrc += 2;
-    }
-  }
-
-  switch (L)
-  {
-  case 16:
-  case 128:
-  case 1024:
-    arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
-    break;
-  case 32:
-  case 256:
-  case 2048:
-    arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
-    break;
-  case 64:
-  case 512:
-  case 4096:
-    arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
-    break;
-  }
-
-  if ( bitReverseFlag )
-    arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
-
-  if (ifftFlag == 1U)
-  {
-    invL = 1.0f / (float32_t)L;
-
-    /* Conjugate and scale output data */
-    pSrc = p1;
-    for (l= 0; l < L; l++)
-    {
-      *pSrc++ *=   invL ;
-      *pSrc    = -(*pSrc) * invL;
-      pSrc++;
-    }
-  }
-}
-#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
-
-/**
-  @} end of ComplexFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_f32.c
+ * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+extern void arm_radix8_butterfly_f32(
+        float32_t * pSrc,
+        uint16_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier);
+
+extern void arm_bitreversal_32(
+        uint32_t * pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t * pBitRevTable);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @defgroup ComplexFFT Complex FFT Functions
+ 
+  @par
+                   The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
+                   Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
+                   than the DFT, especially for long lengths.
+                   The algorithms described in this section
+                   operate on complex data.  A separate set of functions is devoted to handling
+                   of real sequences.
+  @par
+                   There are separate algorithms for handling floating-point, Q15, and Q31 data
+                   types.  The algorithms available for each data type are described next.
+  @par
+                   The FFT functions operate in-place.  That is, the array holding the input data
+                   will also be used to hold the corresponding result.  The input data is complex
+                   and contains <code>2*fftLen</code> interleaved values as shown below.
+                   <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
+                   The FFT result will be contained in the same array and the frequency domain
+                   values will have the same interleaving.
+ 
+  @par Floating-point
+                   The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
+                   stages are performed along with a single radix-2 or radix-4 stage, as needed.
+                   The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+                   a different twiddle factor table.
+  @par
+                   The function uses the standard FFT definition and output values may grow by a
+                   factor of <code>fftLen</code> when computing the forward transform.  The
+                   inverse transform includes a scale of <code>1/fftLen</code> as part of the
+                   calculation and this matches the textbook definition of the inverse FFT.
+  @par
+                   Pre-initialized data structures containing twiddle factors and bit reversal
+                   tables are provided and defined in <code>arm_const_structs.h</code>.  Include
+                   this header in your function and then pass one of the constant structures as
+                   an argument to arm_cfft_f32.  For example:
+  @par
+                   <code>arm_cfft_f32(&arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
+  @par
+                   computes a 64-point inverse complex FFT including bit reversal.
+                   The data structures are treated as constant data and not modified during the
+                   calculation.  The same data structure can be reused for multiple transforms
+                   including mixing forward and inverse transforms.
+  @par
+                   Earlier releases of the library provided separate radix-2 and radix-4
+                   algorithms that operated on floating-point data.  These functions are still
+                   provided but are deprecated.  The older functions are slower and less general
+                   than the new functions.
+  @par
+                   An example of initialization of the constants for the arm_cfft_f32 function follows:
+  @code
+                   const static arm_cfft_instance_f32 *S;
+                   ...
+                     switch (length) {
+                       case 16:
+                         S = &arm_cfft_sR_f32_len16;
+                         break;
+                       case 32:
+                         S = &arm_cfft_sR_f32_len32;
+                         break;
+                       case 64:
+                         S = &arm_cfft_sR_f32_len64;
+                         break;
+                       case 128:
+                         S = &arm_cfft_sR_f32_len128;
+                         break;
+                       case 256:
+                         S = &arm_cfft_sR_f32_len256;
+                         break;
+                       case 512:
+                         S = &arm_cfft_sR_f32_len512;
+                         break;
+                       case 1024:
+                         S = &arm_cfft_sR_f32_len1024;
+                         break;
+                       case 2048:
+                         S = &arm_cfft_sR_f32_len2048;
+                         break;
+                       case 4096:
+                         S = &arm_cfft_sR_f32_len4096;
+                         break;
+                     }
+  @endcode
+  @par Q15 and Q31
+                   The fixed-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
+                   stages are performed along with a single radix-2 stage, as needed.
+                   The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+                   a different twiddle factor table.
+  @par
+                   The function uses the standard FFT definition and output values may grow by a
+                   factor of <code>fftLen</code> when computing the forward transform.  The
+                   inverse transform includes a scale of <code>1/fftLen</code> as part of the
+                   calculation and this matches the textbook definition of the inverse FFT.
+  @par
+                   Pre-initialized data structures containing twiddle factors and bit reversal
+                   tables are provided and defined in <code>arm_const_structs.h</code>.  Include
+                   this header in your function and then pass one of the constant structures as
+                   an argument to arm_cfft_q31. For example:
+  @par
+                   <code>arm_cfft_q31(&arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
+  @par
+                   computes a 64-point inverse complex FFT including bit reversal.
+                   The data structures are treated as constant data and not modified during the
+                   calculation.  The same data structure can be reused for multiple transforms
+                   including mixing forward and inverse transforms.
+  @par
+                   Earlier releases of the library provided separate radix-2 and radix-4
+                   algorithms that operated on fixed-point data.  These functions are still
+                   provided but are deprecated.  The older functions are slower and less general
+                   than the new functions.
+  @par
+                   An example of initialization of the constants for the arm_cfft_q31 function follows:
+  @code
+                   const static arm_cfft_instance_q31 *S;
+                   ...
+                     switch (length) {
+                       case 16:
+                         S = &arm_cfft_sR_q31_len16;
+                         break;
+                       case 32:
+                         S = &arm_cfft_sR_q31_len32;
+                         break;
+                       case 64:
+                         S = &arm_cfft_sR_q31_len64;
+                         break;
+                       case 128:
+                         S = &arm_cfft_sR_q31_len128;
+                         break;
+                       case 256:
+                         S = &arm_cfft_sR_q31_len256;
+                         break;
+                       case 512:
+                         S = &arm_cfft_sR_q31_len512;
+                         break;
+                       case 1024:
+                         S = &arm_cfft_sR_q31_len1024;
+                         break;
+                       case 2048:
+                         S = &arm_cfft_sR_q31_len2048;
+                         break;
+                       case 4096:
+                         S = &arm_cfft_sR_q31_len4096;
+                         break;
+                     }
+  @endcode
+ 
+ */
+
+void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
+{ /* One radix-2 split over the whole buffer, then arm_radix8_butterfly_f32 on each half. */
+  uint32_t    L  = S->fftLen;                  /* starts as fftLen; halved below */
+  float32_t * pCol1, * pCol2, * pMid1, * pMid2;
+  float32_t * p2 = p1 + L;                     /* fftLen floats past p1 */
+  const float32_t * tw = (float32_t *) S->pTwiddle;
+  float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
+  float32_t m0, m1, m2, m3;
+  uint32_t l;
+
+  pCol1 = p1;                                  /* saved: p1 and p2 are advanced by the loop */
+  pCol2 = p2;
+
+  /* Define new length (L becomes fftLen / 2) */
+  L >>= 1;
+
+  /* Initialize mid pointers (L floats past p1 and p2 respectively) */
+  pMid1 = p1 + L;
+  pMid2 = p2 + L;
+
+  /* do two-point Fourier transform (radix-2 butterfly); each pass reads 4 floats from each of the 4 pointers */
+  for (l = L >> 2; l > 0; l-- )
+  {
+    t1[0] = p1[0];
+    t1[1] = p1[1];
+    t1[2] = p1[2];
+    t1[3] = p1[3];
+
+    t2[0] = p2[0];
+    t2[1] = p2[1];
+    t2[2] = p2[2];
+    t2[3] = p2[3];
+
+    t3[0] = pMid1[0];
+    t3[1] = pMid1[1];
+    t3[2] = pMid1[2];
+    t3[3] = pMid1[3];
+
+    t4[0] = pMid2[0];
+    t4[1] = pMid2[1];
+    t4[2] = pMid2[2];
+    t4[3] = pMid2[3];
+
+    *p1++ = t1[0] + t2[0];
+    *p1++ = t1[1] + t2[1];
+    *p1++ = t1[2] + t2[2];
+    *p1++ = t1[3] + t2[3];    /* col 1 */
+
+    t2[0] = t1[0] - t2[0];
+    t2[1] = t1[1] - t2[1];
+    t2[2] = t1[2] - t2[2];
+    t2[3] = t1[3] - t2[3];    /* for col 2 */
+
+    *pMid1++ = t3[0] + t4[0];
+    *pMid1++ = t3[1] + t4[1];
+    *pMid1++ = t3[2] + t4[2];
+    *pMid1++ = t3[3] + t4[3]; /* col 1 */
+
+    t4[0] = t4[0] - t3[0];
+    t4[1] = t4[1] - t3[1];
+    t4[2] = t4[2] - t3[2];
+    t4[3] = t4[3] - t3[3];    /* for col 2 (note: t4 - t3, the opposite order to t1 - t2 above) */
+
+    twR = *tw++;              /* consume one (twR, twI) pair from the twiddle table */
+    twI = *tw++;
+
+    /* multiply by twiddle factors */
+    m0 = t2[0] * twR;
+    m1 = t2[1] * twI;
+    m2 = t2[1] * twR;
+    m3 = t2[0] * twI;
+
+    /* R  =  R  *  Tr + I * Ti  (product with the conjugate of Tr + i*Ti) */
+    *p2++ = m0 + m1;
+    /* I  =  I  *  Tr - R * Ti */
+    *p2++ = m2 - m3;
+
+    /* use vertical symmetry */
+    /*  0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
+    m0 = t4[0] * twI;
+    m1 = t4[1] * twR;
+    m2 = t4[1] * twI;
+    m3 = t4[0] * twR;
+
+    *pMid2++ = m0 - m1;
+    *pMid2++ = m2 + m3;
+
+    twR = *tw++;              /* next (twR, twI) pair, used for the second sample of this pass */
+    twI = *tw++;
+
+    m0 = t2[2] * twR;
+    m1 = t2[3] * twI;
+    m2 = t2[3] * twR;
+    m3 = t2[2] * twI;
+
+    *p2++ = m0 + m1;
+    *p2++ = m2 - m3;
+
+    m0 = t4[2] * twI;
+    m1 = t4[3] * twR;
+    m2 = t4[3] * twI;
+    m3 = t4[2] * twR;
+
+    *pMid2++ = m0 - m1;
+    *pMid2++ = m2 + m3;
+  }
+
+  /* first col (L is fftLen / 2 here; twiddle modifier 2) */
+  arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);
+
+  /* second col */
+  arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
+}
+
+void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
+{ /* One radix-4 split over four columns, then arm_radix8_butterfly_f32 on each column. */
+    uint32_t    L  = S->fftLen >> 1;   /* fftLen / 2 floats: spacing between the column pointers below */
+    float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
+    const float32_t *tw2, *tw3, *tw4;
+    float32_t * p2 = p1 + L;
+    float32_t * p3 = p2 + L;
+    float32_t * p4 = p3 + L;
+    float32_t t2[4], t3[4], t4[4], twR, twI;
+    float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
+    float32_t m0, m1, m2, m3;
+    uint32_t l, twMod2, twMod3, twMod4;
+
+    pCol1 = p1;         /* points to real values by default */
+    pCol2 = p2;
+    pCol3 = p3;
+    pCol4 = p4;
+    pEnd1 = p2 - 1;     /* points to imaginary values by default */
+    pEnd2 = p3 - 1;
+    pEnd3 = p4 - 1;
+    pEnd4 = pEnd3 + L;  /* i.e. p4 + L - 1 */
+
+    tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;
+
+    L >>= 1;            /* now fftLen / 4: the length handed to each butterfly call at the end */
+
+    /* do four-point Fourier transform (radix-4 butterfly) */
+
+    twMod2 = 2;         /* float steps applied to tw2 / tw3 / tw4 */
+    twMod3 = 4;
+    twMod4 = 6;
+
+    /* TOP: first sample of every column, handled before the loop */
+    p1ap3_0 = p1[0] + p3[0];
+    p1sp3_0 = p1[0] - p3[0];
+    p1ap3_1 = p1[1] + p3[1];
+    p1sp3_1 = p1[1] - p3[1];
+
+    /* col 2 */
+    t2[0] = p1sp3_0 + p2[1] - p4[1];
+    t2[1] = p1sp3_1 - p2[0] + p4[0];
+    /* col 3 */
+    t3[0] = p1ap3_0 - p2[0] - p4[0];
+    t3[1] = p1ap3_1 - p2[1] - p4[1];
+    /* col 4 */
+    t4[0] = p1sp3_0 - p2[1] + p4[1];
+    t4[1] = p1sp3_1 + p2[0] - p4[0];
+    /* col 1 */
+    *p1++ = p1ap3_0 + p2[0] + p4[0];
+    *p1++ = p1ap3_1 + p2[1] + p4[1];
+
+    /* Twiddle factors are ones */
+    *p2++ = t2[0];
+    *p2++ = t2[1];
+    *p3++ = t3[0];
+    *p3++ = t3[1];
+    *p4++ = t4[0];
+    *p4++ = t4[1];
+
+    tw2 += twMod2;      /* skip the table entry that the unit-twiddle case above did not need */
+    tw3 += twMod3;
+    tw4 += twMod4;
+
+    for (l = (L - 2) >> 1; l > 0; l-- )   /* each pass: one sample from the top and one from the bottom of every column */
+    {
+      /* TOP */
+      p1ap3_0 = p1[0] + p3[0];
+      p1sp3_0 = p1[0] - p3[0];
+      p1ap3_1 = p1[1] + p3[1];
+      p1sp3_1 = p1[1] - p3[1];
+      /* col 2 */
+      t2[0] = p1sp3_0 + p2[1] - p4[1];
+      t2[1] = p1sp3_1 - p2[0] + p4[0];
+      /* col 3 */
+      t3[0] = p1ap3_0 - p2[0] - p4[0];
+      t3[1] = p1ap3_1 - p2[1] - p4[1];
+      /* col 4 */
+      t4[0] = p1sp3_0 - p2[1] + p4[1];
+      t4[1] = p1sp3_1 + p2[0] - p4[0];
+      /* col 1 - top */
+      *p1++ = p1ap3_0 + p2[0] + p4[0];
+      *p1++ = p1ap3_1 + p2[1] + p4[1];
+
+      /* BOTTOM (pEndN[0] is read as the imaginary part, pEndN[-1] as the real part) */
+      p1ap3_1 = pEnd1[-1] + pEnd3[-1];
+      p1sp3_1 = pEnd1[-1] - pEnd3[-1];
+      p1ap3_0 = pEnd1[ 0] + pEnd3[0];
+      p1sp3_0 = pEnd1[ 0] - pEnd3[0];
+      /* col 2 */
+      t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
+      t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];   /* pEnd1[0] - pEnd3[0] is the same expression as p1sp3_0 */
+      /* col 3 */
+      t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
+      t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
+      /* col 4 */
+      t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
+      t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
+      /* col 1 - Bottom */
+      *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
+      *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];
+
+      /* COL 2 */
+      /* read twiddle factors */
+      twR = *tw2++;
+      twI = *tw2++;
+      /* multiply by twiddle factors */
+      /*  let    Z1 = a + i(b),   Z2 = c + i(d)   (Z2 = twR + i*twI) */
+      /*   =>  Z1 * conj(Z2)  =  (a*c + b*d) + i(b*c - a*d), which is what the Top code computes */
+
+      /* Top */
+      m0 = t2[0] * twR;
+      m1 = t2[1] * twI;
+      m2 = t2[1] * twR;
+      m3 = t2[0] * twI;
+
+      *p2++ = m0 + m1;
+      *p2++ = m2 - m3;
+      /* use vertical symmetry col 2 */
+      /* 0.9997 - 0.0245i  <==>  0.0245 - 0.9997i */
+      /* Bottom */
+      m0 = t2[3] * twI;
+      m1 = t2[2] * twR;
+      m2 = t2[2] * twI;
+      m3 = t2[3] * twR;
+
+      *pEnd2-- = m0 - m1;
+      *pEnd2-- = m2 + m3;
+
+      /* COL 3 */
+      twR = tw3[0];
+      twI = tw3[1];
+      tw3 += twMod3;
+      /* Top */
+      m0 = t3[0] * twR;
+      m1 = t3[1] * twI;
+      m2 = t3[1] * twR;
+      m3 = t3[0] * twI;
+
+      *p3++ = m0 + m1;
+      *p3++ = m2 - m3;
+      /* use vertical symmetry col 3 */
+      /* 0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i */
+      /* Bottom */
+      m0 = -t3[3] * twR;
+      m1 =  t3[2] * twI;
+      m2 =  t3[2] * twR;
+      m3 =  t3[3] * twI;
+
+      *pEnd3-- = m0 - m1;
+      *pEnd3-- = m3 - m2;
+
+      /* COL 4 */
+      twR = tw4[0];
+      twI = tw4[1];
+      tw4 += twMod4;
+      /* Top */
+      m0 = t4[0] * twR;
+      m1 = t4[1] * twI;
+      m2 = t4[1] * twR;
+      m3 = t4[0] * twI;
+
+      *p4++ = m0 + m1;
+      *p4++ = m2 - m3;
+      /* use vertical symmetry col 4 */
+      /* 0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i */
+      /* Bottom */
+      m0 = t4[3] * twI;
+      m1 = t4[2] * twR;
+      m2 = t4[2] * twI;
+      m3 = t4[3] * twR;
+
+      *pEnd4-- = m0 - m1;
+      *pEnd4-- = m2 + m3;
+    }
+
+    /* MIDDLE: the one remaining sample per column, using the current tw2 / tw3 / tw4 entries */
+    /* Twiddle factors are */
+    /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
+    p1ap3_0 = p1[0] + p3[0];
+    p1sp3_0 = p1[0] - p3[0];
+    p1ap3_1 = p1[1] + p3[1];
+    p1sp3_1 = p1[1] - p3[1];
+
+    /* col 2 */
+    t2[0] = p1sp3_0 + p2[1] - p4[1];
+    t2[1] = p1sp3_1 - p2[0] + p4[0];
+    /* col 3 */
+    t3[0] = p1ap3_0 - p2[0] - p4[0];
+    t3[1] = p1ap3_1 - p2[1] - p4[1];
+    /* col 4 */
+    t4[0] = p1sp3_0 - p2[1] + p4[1];
+    t4[1] = p1sp3_1 + p2[0] - p4[0];
+    /* col 1 - Top */
+    *p1++ = p1ap3_0 + p2[0] + p4[0];
+    *p1++ = p1ap3_1 + p2[1] + p4[1];
+
+    /* COL 2 */
+    twR = tw2[0];
+    twI = tw2[1];
+
+    m0 = t2[0] * twR;
+    m1 = t2[1] * twI;
+    m2 = t2[1] * twR;
+    m3 = t2[0] * twI;
+
+    *p2++ = m0 + m1;
+    *p2++ = m2 - m3;
+    /* COL 3 */
+    twR = tw3[0];
+    twI = tw3[1];
+
+    m0 = t3[0] * twR;
+    m1 = t3[1] * twI;
+    m2 = t3[1] * twR;
+    m3 = t3[0] * twI;
+
+    *p3++ = m0 + m1;
+    *p3++ = m2 - m3;
+    /* COL 4 */
+    twR = tw4[0];
+    twI = tw4[1];
+
+    m0 = t4[0] * twR;
+    m1 = t4[1] * twI;
+    m2 = t4[1] * twR;
+    m3 = t4[0] * twI;
+
+    *p4++ = m0 + m1;
+    *p4++ = m2 - m3;
+
+    /* first col (L is fftLen / 4 here; twiddle modifier 4) */
+    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);
+
+    /* second col */
+    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);
+
+    /* third col */
+    arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);
+
+    /* fourth col */
+    arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
+}
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the floating-point complex FFT.
+  @param[in]     S              points to an instance of the floating-point CFFT structure
+  @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code> (interleaved real, imaginary). Processing occurs in-place
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform (any value other than 1 is treated the same way)
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output (any nonzero value does)
+  @return        none
+ */
+
+void arm_cfft_f32(
+  const arm_cfft_instance_f32 * S,
+        float32_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag)
+{
+  uint32_t  L = S->fftLen, l;
+  float32_t invL, * pSrc;
+
+  if (ifftFlag == 1U)
+  {
+    /* Conjugate input data */
+    pSrc = p1 + 1;            /* first imaginary part; stepping by 2 visits every imaginary part */
+    for (l = 0; l < L; l++)
+    {
+      *pSrc = -*pSrc;
+      pSrc += 2;
+    }
+  }
+
+  switch (L)                  /* no default case: a length not listed here skips the butterfly stage */
+  {
+  case 16:
+  case 128:
+  case 1024:
+    arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);   /* cast drops const to match the helper's parameter type */
+    break;
+  case 32:
+  case 256:
+  case 2048:
+    arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
+    break;
+  case 64:
+  case 512:
+  case 4096:
+    arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);   /* full length, twiddle modifier 1 */
+    break;
+  }
+
+  if ( bitReverseFlag )       /* runs regardless of whether a switch case matched */
+    arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
+
+  if (ifftFlag == 1U)
+  {
+    invL = 1.0f / (float32_t)L;
+
+    /* Conjugate and scale output data */
+    pSrc = p1;
+    for (l= 0; l < L; l++)
+    {
+      *pSrc++ *=   invL ;     /* real part: scale by 1/L */
+      *pSrc    = -(*pSrc) * invL;   /* imaginary part: negate and scale by 1/L */
+      pSrc++;
+    }
+  }
+}
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
index 74a6e7a..a47dd02 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
@@ -1,893 +1,332 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_q15.c
- * Description:  Combined Radix Decimation in Q15 Frequency CFFT processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "arm_vec_fft.h"
-
-
-static void _arm_radix4_butterfly_q15_mve(
-    const arm_cfft_instance_q15 * S,
-    q15_t   *pSrc,
-    uint32_t fftLen)
-{
-    q15x8_t vecTmp0, vecTmp1;
-    q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
-    q15x8_t vecA, vecB, vecC, vecD;
-    uint32_t  blkCnt;
-    uint32_t  n1, n2;
-    uint32_t  stage = 0;
-    int32_t  iter = 1;
-    static const int32_t strides[4] = {
-        (0 - 16) * (int32_t)sizeof(q15_t *), (4 - 16) * (int32_t)sizeof(q15_t *),
-        (8 - 16) * (int32_t)sizeof(q15_t *), (12 - 16) * (int32_t)sizeof(q15_t *)
-    };
-
-    /*
-     * Process first stages
-     * Each stage in middle stages provides two down scaling of the input
-     */
-    n2 = fftLen;
-    n1 = n2;
-    n2 >>= 2u;
-
-    for (int k = fftLen / 4u; k > 1; k >>= 2u)
-    {
-        q15_t const *p_rearranged_twiddle_tab_stride2 =
-            &S->rearranged_twiddle_stride2[
-            S->rearranged_twiddle_tab_stride2_arr[stage]];
-        q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
-            S->rearranged_twiddle_tab_stride3_arr[stage]];
-        q15_t const *p_rearranged_twiddle_tab_stride1 =
-            &S->rearranged_twiddle_stride1[
-            S->rearranged_twiddle_tab_stride1_arr[stage]];
-
-        q15_t * pBase = pSrc;
-        for (int i = 0; i < iter; i++)
-        {
-            q15_t    *inA = pBase;
-            q15_t    *inB = inA + n2 * CMPLX_DIM;
-            q15_t    *inC = inB + n2 * CMPLX_DIM;
-            q15_t    *inD = inC + n2 * CMPLX_DIM;
-            q15_t const *pW1 = p_rearranged_twiddle_tab_stride1;
-            q15_t const *pW2 = p_rearranged_twiddle_tab_stride2;
-            q15_t const *pW3 = p_rearranged_twiddle_tab_stride3;
-            q15x8_t    vecW;
-
-            blkCnt = n2 / 4;
-            /*
-             * load 4 x q15 complex pair
-             */
-            vecA = vldrhq_s16(inA);
-            vecC = vldrhq_s16(inC);
-            while (blkCnt > 0U)
-            {
-                vecB = vldrhq_s16(inB);
-                vecD = vldrhq_s16(inD);
-
-                vecSum0 = vhaddq(vecA, vecC);
-                vecDiff0 = vhsubq(vecA, vecC);
-
-                vecSum1 = vhaddq(vecB, vecD);
-                vecDiff1 = vhsubq(vecB, vecD);
-                /*
-                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
-                 */
-                vecTmp0 = vhaddq(vecSum0, vecSum1);
-                vst1q(inA, vecTmp0);
-                inA += 8;
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'
-                 */
-                vecTmp0 = vhsubq(vecSum0, vecSum1);
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
-                 */
-                vecW = vld1q(pW2);
-                pW2 += 8;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t);
-
-                vst1q(inB, vecTmp1);
-                inB += 8;
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
-                 */
-                vecW = vld1q(pW1);
-                pW1 += 8;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t);
-                vst1q(inC, vecTmp1);
-                inC += 8;
-
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
-                 */
-                vecW = vld1q(pW3);
-                pW3 += 8;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t);
-                vst1q(inD, vecTmp1);
-                inD += 8;
-
-                vecA = vldrhq_s16(inA);
-                vecC = vldrhq_s16(inC);
-
-                blkCnt--;
-            }
-            pBase +=  CMPLX_DIM * n1;
-        }
-        n1 = n2;
-        n2 >>= 2u;
-        iter = iter << 2;
-        stage++;
-    }
-
-    /*
-     * start of Last stage process
-     */
-    uint32x4_t vecScGathAddr = vld1q_u32 ((uint32_t*)strides);
-    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
-
-    /*
-     * load scheduling
-     */
-    vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
-    vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
-
-    blkCnt = (fftLen >> 4);
-    while (blkCnt > 0U)
-    {
-        vecSum0 = vhaddq(vecA, vecC);
-        vecDiff0 = vhsubq(vecA, vecC);
-
-        vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
-        vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);
-
-        vecSum1 = vhaddq(vecB, vecD);
-        vecDiff1 = vhsubq(vecB, vecD);
-        /*
-         * pre-load for next iteration
-         */
-        vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
-        vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
-
-        vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
-
-        vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
-
-        blkCnt--;
-    }
-
-}
-
-static void arm_cfft_radix4by2_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
-{
-    uint32_t n2;
-    q15_t *pIn0;
-    q15_t *pIn1;
-    const q15_t *pCoef = S->pTwiddle;
-    uint32_t     blkCnt;
-    q15x8_t    vecIn0, vecIn1, vecSum, vecDiff;
-    q15x8_t    vecCmplxTmp, vecTw;
-    q15_t  const *pCoefVec;
-
-    n2 = fftLen >> 1;
-
-    pIn0 = pSrc;
-    pIn1 = pSrc + fftLen;
-    pCoefVec = pCoef;
-
-    blkCnt = n2 / 4;
-
-    while (blkCnt > 0U)
-    {
-        vecIn0 = *(q15x8_t *) pIn0;
-        vecIn1 = *(q15x8_t *) pIn1;
-
-        vecIn0 = vecIn0 >> 1;
-        vecIn1 = vecIn1 >> 1;
-        vecSum = vhaddq(vecIn0, vecIn1);
-        vst1q(pIn0, vecSum);
-        pIn0 += 8;
-
-        vecTw = vld1q(pCoefVec);
-        pCoefVec += 8;
-
-        vecDiff = vhsubq(vecIn0, vecIn1);
-        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw, q15x8_t);
-        vst1q(pIn1, vecCmplxTmp);
-        pIn1 += 8;
-
-        blkCnt--;
-    }
-
-    _arm_radix4_butterfly_q15_mve(S, pSrc, n2);
-
-    _arm_radix4_butterfly_q15_mve(S, pSrc + fftLen, n2);
-
-
-    pIn0 = pSrc;
-    blkCnt = (fftLen << 1) >> 3;
-    while (blkCnt > 0U)
-    {
-        vecIn0 = *(q15x8_t *) pIn0;
-        vecIn0 = vecIn0 << 1;
-        vst1q(pIn0, vecIn0);
-        pIn0 += 8;
-        blkCnt--;
-    }
-    /*
-     * tail
-     * (will be merged thru tail predication)
-     */
-    blkCnt = (fftLen << 1) & 7;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp16q(blkCnt);
-
-        vecIn0 = *(q15x8_t *) pIn0;
-        vecIn0 = vecIn0 << 1;
-        vstrhq_p(pIn0, vecIn0, p0);
-    }
-}
-
-static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S,q15_t *pSrc, uint32_t fftLen)
-{
-    q15x8_t vecTmp0, vecTmp1;
-    q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
-    q15x8_t vecA, vecB, vecC, vecD;
-    uint32_t  blkCnt;
-    uint32_t  n1, n2;
-    uint32_t  stage = 0;
-    int32_t  iter = 1;
-    static const int32_t strides[4] = {
-        (0 - 16) * (int32_t)sizeof(q15_t *), (4 - 16) * (int32_t)sizeof(q15_t *),
-        (8 - 16) * (int32_t)sizeof(q15_t *), (12 - 16) * (int32_t)sizeof(q15_t *)
-    };
-
-
-    /*
-     * Process first stages
-     * Each stage in middle stages provides two down scaling of the input
-     */
-    n2 = fftLen;
-    n1 = n2;
-    n2 >>= 2u;
-
-    for (int k = fftLen / 4u; k > 1; k >>= 2u)
-    {
-        q15_t const *p_rearranged_twiddle_tab_stride2 =
-            &S->rearranged_twiddle_stride2[
-            S->rearranged_twiddle_tab_stride2_arr[stage]];
-        q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
-            S->rearranged_twiddle_tab_stride3_arr[stage]];
-        q15_t const *p_rearranged_twiddle_tab_stride1 =
-            &S->rearranged_twiddle_stride1[
-            S->rearranged_twiddle_tab_stride1_arr[stage]];
-
-        q15_t * pBase = pSrc;
-        for (int i = 0; i < iter; i++)
-        {
-            q15_t    *inA = pBase;
-            q15_t    *inB = inA + n2 * CMPLX_DIM;
-            q15_t    *inC = inB + n2 * CMPLX_DIM;
-            q15_t    *inD = inC + n2 * CMPLX_DIM;
-            q15_t const *pW1 = p_rearranged_twiddle_tab_stride1;
-            q15_t const *pW2 = p_rearranged_twiddle_tab_stride2;
-            q15_t const *pW3 = p_rearranged_twiddle_tab_stride3;
-            q15x8_t    vecW;
-
-
-            blkCnt = n2 / 4;
-            /*
-             * load 4 x q15 complex pair
-             */
-            vecA = vldrhq_s16(inA);
-            vecC = vldrhq_s16(inC);
-            while (blkCnt > 0U)
-            {
-                vecB = vldrhq_s16(inB);
-                vecD = vldrhq_s16(inD);
-
-                vecSum0 = vhaddq(vecA, vecC);
-                vecDiff0 = vhsubq(vecA, vecC);
-
-                vecSum1 = vhaddq(vecB, vecD);
-                vecDiff1 = vhsubq(vecB, vecD);
-                /*
-                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
-                 */
-                vecTmp0 = vhaddq(vecSum0, vecSum1);
-                vst1q(inA, vecTmp0);
-                inA += 8;
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'
-                 */
-                vecTmp0 = vhsubq(vecSum0, vecSum1);
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
-                 */
-                vecW = vld1q(pW2);
-                pW2 += 8;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t);
-
-                vst1q(inB, vecTmp1);
-                inB += 8;
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
-                 */
-                vecW = vld1q(pW1);
-                pW1 += 8;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t);
-                vst1q(inC, vecTmp1);
-                inC += 8;
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
-                 */
-                vecW = vld1q(pW3);
-                pW3 += 8;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t);
-                vst1q(inD, vecTmp1);
-                inD += 8;
-
-                vecA = vldrhq_s16(inA);
-                vecC = vldrhq_s16(inC);
-
-                blkCnt--;
-            }
-            pBase +=  CMPLX_DIM * n1;
-        }
-        n1 = n2;
-        n2 >>= 2u;
-        iter = iter << 2;
-        stage++;
-    }
-
-    /*
-     * start of Last stage process
-     */
-    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
-    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
-
-    /*
-     * load scheduling
-     */
-    vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
-    vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
-
-    blkCnt = (fftLen >> 4);
-    while (blkCnt > 0U)
-    {
-        vecSum0 = vhaddq(vecA, vecC);
-        vecDiff0 = vhsubq(vecA, vecC);
-
-        vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
-        vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);
-
-        vecSum1 = vhaddq(vecB, vecD);
-        vecDiff1 = vhsubq(vecB, vecD);
-        /*
-         * pre-load for next iteration
-         */
-        vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
-        vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
-
-        vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
-
-        vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
-
-        blkCnt--;
-    }
-}
-
-static void arm_cfft_radix4by2_inverse_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
-{
-    uint32_t n2;
-    q15_t *pIn0;
-    q15_t *pIn1;
-    const q15_t *pCoef = S->pTwiddle;
-
-    uint32_t     blkCnt;
-    q15x8_t    vecIn0, vecIn1, vecSum, vecDiff;
-    q15x8_t    vecCmplxTmp, vecTw;
-    q15_t  const *pCoefVec;
-
-    n2 = fftLen >> 1;
-
-    pIn0 = pSrc;
-    pIn1 = pSrc + fftLen;
-    pCoefVec = pCoef;
-
-    blkCnt = n2 / 4;
-
-    while (blkCnt > 0U)
-    {
-        vecIn0 = *(q15x8_t *) pIn0;
-        vecIn1 = *(q15x8_t *) pIn1;
-
-        vecIn0 = vecIn0 >> 1;
-        vecIn1 = vecIn1 >> 1;
-        vecSum = vhaddq(vecIn0, vecIn1);
-        vst1q(pIn0, vecSum);
-        pIn0 += 8;
-
-        vecTw = vld1q(pCoefVec);
-        pCoefVec += 8;
-
-        vecDiff = vhsubq(vecIn0, vecIn1);
-        vecCmplxTmp = vqrdmlsdhq(vuninitializedq_s16() , vecDiff, vecTw);
-        vecCmplxTmp = vqrdmladhxq(vecCmplxTmp, vecDiff, vecTw);
-        vst1q(pIn1, vecCmplxTmp);
-        pIn1 += 8;
-
-        blkCnt--;
-    }
-
-
-    _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, n2);
-
-    _arm_radix4_butterfly_inverse_q15_mve(S, pSrc + fftLen, n2);
-
-    pIn0 = pSrc;
-    blkCnt = (fftLen << 1) >> 3;
-    while (blkCnt > 0U)
-    {
-        vecIn0 = *(q15x8_t *) pIn0;
-        vecIn0 = vecIn0 << 1;
-        vst1q(pIn0, vecIn0);
-        pIn0 += 8;
-        blkCnt--;
-    }
-    /*
-     * tail
-     * (will be merged thru tail predication)
-     */
-    blkCnt = (fftLen << 1) & 7;
-    while (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp16q(blkCnt);
-
-        vecIn0 = *(q15x8_t *) pIn0;
-        vecIn0 = vecIn0 << 1;
-        vstrhq_p(pIn0, vecIn0, p0);
-    }
-}
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for Q15 complex FFT.
-  @param[in]     S               points to an instance of Q15 CFFT structure
-  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        none
- */
-void arm_cfft_q15(
-  const arm_cfft_instance_q15 * S,
-        q15_t * pSrc,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag)
-{
-        uint32_t fftLen = S->fftLen;
-
-        if (ifftFlag == 1U) {
-
-            switch (fftLen) {
-            case 16:
-            case 64:
-            case 256:
-            case 1024:
-            case 4096:
-                _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
-                break;
-
-            case 32:
-            case 128:
-            case 512:
-            case 2048:
-                arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
-                break;
-            }
-        } else {
-            switch (fftLen) {
-            case 16:
-            case 64:
-            case 256:
-            case 1024:
-            case 4096:
-                _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
-                break;
-
-            case 32:
-            case 128:
-            case 512:
-            case 2048:
-                arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
-                break;
-            }
-        }
-
-
-        if (bitReverseFlag)
-        {
-
-            arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-
-        }
-}
-
-#else
-
-extern void arm_radix4_butterfly_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef,
-        uint32_t twidCoefModifier);
-
-extern void arm_radix4_butterfly_inverse_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef,
-        uint32_t twidCoefModifier);
-
-extern void arm_bitreversal_16(
-        uint16_t * pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t * pBitRevTable);
-
-void arm_cfft_radix4by2_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef);
-
-void arm_cfft_radix4by2_inverse_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef);
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for Q15 complex FFT.
-  @param[in]     S               points to an instance of Q15 CFFT structure
-  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        none
- */
-
-void arm_cfft_q15(
-  const arm_cfft_instance_q15 * S,
-        q15_t * p1,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag)
-{
-  uint32_t L = S->fftLen;
-
-  if (ifftFlag == 1U)
-  {
-     switch (L)
-     {
-     case 16:
-     case 64:
-     case 256:
-     case 1024:
-     case 4096:
-       arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
-       break;
-
-     case 32:
-     case 128:
-     case 512:
-     case 2048:
-       arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
-       break;
-     }
-  }
-  else
-  {
-     switch (L)
-     {
-     case 16:
-     case 64:
-     case 256:
-     case 1024:
-     case 4096:
-       arm_radix4_butterfly_q15  ( p1, L, (q15_t*)S->pTwiddle, 1 );
-       break;
-
-     case 32:
-     case 128:
-     case 512:
-     case 2048:
-       arm_cfft_radix4by2_q15  ( p1, L, S->pTwiddle );
-       break;
-     }
-  }
-
-  if ( bitReverseFlag )
-    arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
-}
-
-/**
-  @} end of ComplexFFT group
- */
-
-void arm_cfft_radix4by2_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef)
-{
-        uint32_t i;
-        uint32_t n2;
-        q15_t p0, p1, p2, p3;
-#if defined (ARM_MATH_DSP)
-        q31_t T, S, R;
-        q31_t coeff, out1, out2;
-  const q15_t *pC = pCoef;
-        q15_t *pSi = pSrc;
-        q15_t *pSl = pSrc + fftLen;
-#else
-        uint32_t l;
-        q15_t xt, yt, cosVal, sinVal;
-#endif
-
-  n2 = fftLen >> 1U;
-
-#if defined (ARM_MATH_DSP)
-
-  for (i = n2; i > 0; i--)
-  {
-      coeff = read_q15x2_ia (&pC);
-
-      T = read_q15x2 (pSi);
-      T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
-
-      S = read_q15x2 (pSl);
-      S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
-
-      R = __QSUB16(T, S);
-
-      write_q15x2_ia (&pSi, __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-      out1 = __SMUAD(coeff, R) >> 16U;
-      out2 = __SMUSDX(coeff, R);
-#else
-      out1 = __SMUSDX(R, coeff) >> 16U;
-      out2 = __SMUAD(coeff, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-      write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ) );
-  }
-
-#else /* #if defined (ARM_MATH_DSP) */
-
-  for (i = 0; i < n2; i++)
-  {
-     cosVal = pCoef[2 * i];
-     sinVal = pCoef[2 * i + 1];
-
-     l = i + n2;
-
-     xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
-     pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
-
-     yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
-     pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
-
-     pSrc[2 * l]     = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) +
-                        ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );
-
-     pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
-                        ((int16_t) (((q31_t) xt * sinVal) >> 16U))   );
-  }
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-  /* first col */
-  arm_radix4_butterfly_q15( pSrc,          n2, (q15_t*)pCoef, 2U);
-
-  /* second col */
-  arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
-
-  n2 = fftLen >> 1U;
-  for (i = 0; i < n2; i++)
-  {
-     p0 = pSrc[4 * i + 0];
-     p1 = pSrc[4 * i + 1];
-     p2 = pSrc[4 * i + 2];
-     p3 = pSrc[4 * i + 3];
-
-     p0 <<= 1U;
-     p1 <<= 1U;
-     p2 <<= 1U;
-     p3 <<= 1U;
-
-     pSrc[4 * i + 0] = p0;
-     pSrc[4 * i + 1] = p1;
-     pSrc[4 * i + 2] = p2;
-     pSrc[4 * i + 3] = p3;
-  }
-
-}
-
-void arm_cfft_radix4by2_inverse_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef)
-{
-        uint32_t i;
-        uint32_t n2;
-        q15_t p0, p1, p2, p3;
-#if defined (ARM_MATH_DSP)
-        q31_t T, S, R;
-        q31_t coeff, out1, out2;
-  const q15_t *pC = pCoef;
-        q15_t *pSi = pSrc;
-        q15_t *pSl = pSrc + fftLen;
-#else
-        uint32_t l;
-        q15_t xt, yt, cosVal, sinVal;
-#endif
-
-  n2 = fftLen >> 1U;
-
-#if defined (ARM_MATH_DSP)
-
-  for (i = n2; i > 0; i--)
-  {
-     coeff = read_q15x2_ia (&pC);
-
-     T = read_q15x2 (pSi);
-     T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
-
-     S = read_q15x2 (pSl);
-     S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
-
-     R = __QSUB16(T, S);
-
-     write_q15x2_ia (&pSi, __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-     out1 = __SMUSD(coeff, R) >> 16U;
-     out2 = __SMUADX(coeff, R);
-#else
-     out1 = __SMUADX(R, coeff) >> 16U;
-     out2 = __SMUSD(__QSUB(0, coeff), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-     write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ));
-  }
-
-#else /* #if defined (ARM_MATH_DSP) */
-
-  for (i = 0; i < n2; i++)
-  {
-     cosVal = pCoef[2 * i];
-     sinVal = pCoef[2 * i + 1];
-
-     l = i + n2;
-
-     xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
-     pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
-
-     yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
-     pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
-
-     pSrc[2 * l]      = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) -
-                         ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );
-
-     pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
-                        ((int16_t) (((q31_t) xt * sinVal) >> 16U))  );
-  }
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-  /* first col */
-  arm_radix4_butterfly_inverse_q15( pSrc,          n2, (q15_t*)pCoef, 2U);
-
-  /* second col */
-  arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
-
-  n2 = fftLen >> 1U;
-  for (i = 0; i < n2; i++)
-  {
-     p0 = pSrc[4 * i + 0];
-     p1 = pSrc[4 * i + 1];
-     p2 = pSrc[4 * i + 2];
-     p3 = pSrc[4 * i + 3];
-
-     p0 <<= 1U;
-     p1 <<= 1U;
-     p2 <<= 1U;
-     p3 <<= 1U;
-
-     pSrc[4 * i + 0] = p0;
-     pSrc[4 * i + 1] = p1;
-     pSrc[4 * i + 2] = p2;
-     pSrc[4 * i + 3] = p3;
-  }
-}
-
-#endif /* defined(ARM_MATH_MVEI) */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_q15.c
+ * Description:  Combined Radix Decimation in Frequency CFFT Q15 processing function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+extern void arm_radix4_butterfly_q15( /* not defined in this file; called by the routines below */
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef,
+        uint32_t twidCoefModifier);
+
+extern void arm_radix4_butterfly_inverse_q15( /* not defined in this file; called by the routines below */
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef,
+        uint32_t twidCoefModifier);
+
+extern void arm_bitreversal_16( /* not defined in this file; called by arm_cfft_q15 */
+        uint16_t * pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t * pBitRevTable);
+
+void arm_cfft_radix4by2_q15( /* forward declaration; the definition is further down in this file */
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef);
+
+void arm_cfft_radix4by2_inverse_q15( /* forward declaration; the definition is further down in this file */
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for Q15 complex FFT.
+  @param[in]     S               points to an instance of Q15 CFFT structure
+  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        none
+ */
+
+void arm_cfft_q15(
+  const arm_cfft_instance_q15 * S,
+        q15_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag)
+{
+  uint32_t L = S->fftLen; /* transform length, read from the instance */
+
+  if (ifftFlag == 1U) /* only the exact value 1 selects the inverse; every other value takes the forward branch */
+  {
+     switch (L) /* no default case: a length not listed here skips the butterfly stage entirely */
+     {
+     case 16: /* lengths that are powers of 4 (4^2 .. 4^6): radix-4 routine on the whole buffer */
+     case 64:
+     case 256:
+     case 1024:
+     case 4096:
+       arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 ); /* twidCoefModifier = 1 */
+       break;
+
+     case 32: /* lengths of the form 2 * 4^k: handled by the radix4by2 routine in this file */
+     case 128:
+     case 512:
+     case 2048:
+       arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
+       break;
+     }
+  }
+  else
+  {
+     switch (L) /* same length dispatch as the inverse branch, calling the forward routines */
+     {
+     case 16: /* powers of 4 */
+     case 64:
+     case 256:
+     case 1024:
+     case 4096:
+       arm_radix4_butterfly_q15  ( p1, L, (q15_t*)S->pTwiddle, 1 ); /* twidCoefModifier = 1 */
+       break;
+
+     case 32: /* 2 * 4^k */
+     case 128:
+     case 512:
+     case 2048:
+       arm_cfft_radix4by2_q15  ( p1, L, S->pTwiddle );
+       break;
+     }
+  }
+
+  if ( bitReverseFlag ) /* any non-zero value enables it; runs even when L matched no case above */
+    arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
+
+void arm_cfft_radix4by2_q15( /* forward path used by arm_cfft_q15 for lengths 32, 128, 512 and 2048 */
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef)
+{
+        uint32_t i;
+        uint32_t n2;
+        q15_t p0, p1, p2, p3;
+#if defined (ARM_MATH_DSP)
+        q31_t T, S, R;
+        q31_t coeff, out1, out2;
+  const q15_t *pC = pCoef; /* running twiddle pointer */
+        q15_t *pSi = pSrc; /* walks the lower half of the buffer */
+        q15_t *pSl = pSrc + fftLen; /* fftLen q15 values in = complex sample n2, i.e. start of the upper half */
+#else
+        uint32_t l;
+        q15_t xt, yt, cosVal, sinVal;
+#endif
+
+  n2 = fftLen >> 1U; /* number of complex samples in each half */
+
+#if defined (ARM_MATH_DSP)
+
+  for (i = n2; i > 0; i--) /* one pass per sample pair (k, k + n2) */
+  {
+      coeff = read_q15x2_ia ((q15_t **) &pC); /* cast drops const; presumably loads one packed twiddle pair and advances pC - confirm in arm_math.h */
+
+      T = read_q15x2 (pSi);
+      T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
+
+      S = read_q15x2 (pSl);
+      S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
+
+      R = __QSUB16(T, S); /* per-halfword saturating difference of the halved inputs */
+
+      write_q15x2_ia (&pSi, __SHADD16(T, S)); /* lower half <- halved sum; the loop relies on the helper advancing pSi */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      out1 = __SMUAD(coeff, R) >> 16U; /* only the upper 16 bits of the 32-bit result are kept */
+      out2 = __SMUSDX(coeff, R);
+#else
+      out1 = __SMUSDX(R, coeff) >> 16U;
+      out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+      write_q15x2_ia (&pSl, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); /* out2 -> upper halfword, out1 -> lower halfword; the cast binds to the left operand only */
+  }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+  for (i = 0; i < n2; i++) /* scalar version of the same first stage */
+  {
+     cosVal = pCoef[2 * i]; /* twiddle entry i occupies two consecutive q15 values */
+     sinVal = pCoef[2 * i + 1];
+
+     l = i + n2; /* partner sample in the upper half */
+
+     xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U); /* even-index components: difference of the halved inputs */
+     pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U; /* sum of the halved inputs, halved once more */
+
+     yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U); /* same for the odd-index components */
+     pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+
+     pSrc[2 * l]     = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) + /* >> 16 keeps the upper half of each 32-bit product */
+                        ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );
+
+     pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
+                        ((int16_t) (((q31_t) xt * sinVal) >> 16U))   );
+  }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  /* first col */
+  arm_radix4_butterfly_q15( pSrc,          n2, (q15_t*)pCoef, 2U); /* lower half, n2 points, twidCoefModifier = 2 */
+
+  /* second col */
+  arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U); /* upper half, n2 points, twidCoefModifier = 2 */
+
+  n2 = fftLen >> 1U; /* same value as before: n2 has not been modified since it was first set */
+  for (i = 0; i < n2; i++) /* 4 values per pass * fftLen/2 passes = all 2*fftLen q15 values */
+  {
+     p0 = pSrc[4 * i + 0];
+     p1 = pSrc[4 * i + 1];
+     p2 = pSrc[4 * i + 2];
+     p3 = pSrc[4 * i + 3];
+
+     p0 <<= 1U; /* every value in the buffer is doubled, without saturation */
+     p1 <<= 1U;
+     p2 <<= 1U;
+     p3 <<= 1U;
+
+     pSrc[4 * i + 0] = p0;
+     pSrc[4 * i + 1] = p1;
+     pSrc[4 * i + 2] = p2;
+     pSrc[4 * i + 3] = p3;
+  }
+
+}
+
+void arm_cfft_radix4by2_inverse_q15( /* inverse path used by arm_cfft_q15 for lengths 32, 128, 512 and 2048 */
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef)
+{
+        uint32_t i;
+        uint32_t n2;
+        q15_t p0, p1, p2, p3;
+#if defined (ARM_MATH_DSP)
+        q31_t T, S, R;
+        q31_t coeff, out1, out2;
+  const q15_t *pC = pCoef; /* running twiddle pointer */
+        q15_t *pSi = pSrc; /* walks the lower half of the buffer */
+        q15_t *pSl = pSrc + fftLen; /* fftLen q15 values in = complex sample n2, i.e. start of the upper half */
+#else
+        uint32_t l;
+        q15_t xt, yt, cosVal, sinVal;
+#endif
+
+  n2 = fftLen >> 1U; /* number of complex samples in each half */
+
+#if defined (ARM_MATH_DSP)
+
+  for (i = n2; i > 0; i--) /* one pass per sample pair (k, k + n2) */
+  {
+     coeff = read_q15x2_ia ((q15_t **) &pC); /* cast drops const; presumably loads one packed twiddle pair and advances pC - confirm in arm_math.h */
+
+     T = read_q15x2 (pSi);
+     T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
+
+     S = read_q15x2 (pSl);
+     S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
+
+     R = __QSUB16(T, S); /* per-halfword saturating difference of the halved inputs */
+
+     write_q15x2_ia (&pSi, __SHADD16(T, S)); /* lower half <- halved sum; the loop relies on the helper advancing pSi */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+     out1 = __SMUSD(coeff, R) >> 16U; /* only the upper 16 bits of the 32-bit result are kept */
+     out2 = __SMUADX(coeff, R);
+#else
+     out1 = __SMUADX(R, coeff) >> 16U;
+     out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+     write_q15x2_ia (&pSl, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF)); /* out2 -> upper halfword, out1 -> lower halfword; the cast binds to the left operand only */
+  }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+  for (i = 0; i < n2; i++) /* scalar version of the same first stage */
+  {
+     cosVal = pCoef[2 * i]; /* twiddle entry i occupies two consecutive q15 values */
+     sinVal = pCoef[2 * i + 1];
+
+     l = i + n2; /* partner sample in the upper half */
+
+     xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U); /* even-index components: difference of the halved inputs */
+     pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U; /* sum of the halved inputs, halved once more */
+
+     yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U); /* same for the odd-index components */
+     pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+
+     pSrc[2 * l]      = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) - /* sinVal terms carry the opposite sign to arm_cfft_radix4by2_q15 */
+                         ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );
+
+     pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
+                        ((int16_t) (((q31_t) xt * sinVal) >> 16U))  );
+  }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  /* first col */
+  arm_radix4_butterfly_inverse_q15( pSrc,          n2, (q15_t*)pCoef, 2U); /* lower half, n2 points, twidCoefModifier = 2 */
+
+  /* second col */
+  arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U); /* upper half, n2 points, twidCoefModifier = 2 */
+
+  n2 = fftLen >> 1U; /* same value as before: n2 has not been modified since it was first set */
+  for (i = 0; i < n2; i++) /* 4 values per pass * fftLen/2 passes = all 2*fftLen q15 values */
+  {
+     p0 = pSrc[4 * i + 0];
+     p1 = pSrc[4 * i + 1];
+     p2 = pSrc[4 * i + 2];
+     p3 = pSrc[4 * i + 3];
+
+     p0 <<= 1U; /* every value in the buffer is doubled, without saturation */
+     p1 <<= 1U;
+     p2 <<= 1U;
+     p3 <<= 1U;
+
+     pSrc[4 * i + 0] = p0;
+     pSrc[4 * i + 1] = p1;
+     pSrc[4 * i + 2] = p2;
+     pSrc[4 * i + 3] = p3;
+  }
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c
index 78ce505..785942f 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c
@@ -1,847 +1,254 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_q31.c
- * Description:  Combined Radix Decimation in Frequency CFFT fixed point processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-
-
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "arm_vec_fft.h"
-
-
-static void _arm_radix4_butterfly_q31_mve(
-    const arm_cfft_instance_q31 * S,
-    q31_t   *pSrc,
-    uint32_t fftLen)
-{
-    q31x4_t vecTmp0, vecTmp1;
-    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
-    q31x4_t vecA, vecB, vecC, vecD;
-    uint32_t  blkCnt;
-    uint32_t  n1, n2;
-    uint32_t  stage = 0;
-    int32_t  iter = 1;
-    static const int32_t strides[4] = {
-        (0 - 16) * (int32_t)sizeof(q31_t *), (1 - 16) * (int32_t)sizeof(q31_t *),
-        (8 - 16) * (int32_t)sizeof(q31_t *), (9 - 16) * (int32_t)sizeof(q31_t *)
-    };
-
-
-    /*
-     * Process first stages
-     * Each stage in middle stages provides two down scaling of the input
-     */
-    n2 = fftLen;
-    n1 = n2;
-    n2 >>= 2u;
-
-    for (int k = fftLen / 4u; k > 1; k >>= 2u)
-    {
-        q31_t const *p_rearranged_twiddle_tab_stride2 =
-            &S->rearranged_twiddle_stride2[
-            S->rearranged_twiddle_tab_stride2_arr[stage]];
-        q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
-            S->rearranged_twiddle_tab_stride3_arr[stage]];
-        q31_t const *p_rearranged_twiddle_tab_stride1 =
-            &S->rearranged_twiddle_stride1[
-            S->rearranged_twiddle_tab_stride1_arr[stage]];
-
-        q31_t * pBase = pSrc;
-        for (int i = 0; i < iter; i++)
-        {
-            q31_t    *inA = pBase;
-            q31_t    *inB = inA + n2 * CMPLX_DIM;
-            q31_t    *inC = inB + n2 * CMPLX_DIM;
-            q31_t    *inD = inC + n2 * CMPLX_DIM;
-            q31_t const *pW1 = p_rearranged_twiddle_tab_stride1;
-            q31_t const *pW2 = p_rearranged_twiddle_tab_stride2;
-            q31_t const *pW3 = p_rearranged_twiddle_tab_stride3;
-            q31x4_t    vecW;
-
-
-            blkCnt = n2 / 2;
-            /*
-             * load 2 x q31 complex pair
-             */
-            vecA = vldrwq_s32(inA);
-            vecC = vldrwq_s32(inC);
-            while (blkCnt > 0U)
-            {
-                vecB = vldrwq_s32(inB);
-                vecD = vldrwq_s32(inD);
-
-                vecSum0 = vhaddq(vecA, vecC);
-                vecDiff0 = vhsubq(vecA, vecC);
-
-                vecSum1 = vhaddq(vecB, vecD);
-                vecDiff1 = vhsubq(vecB, vecD);
-                /*
-                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
-                 */
-                vecTmp0 = vhaddq(vecSum0, vecSum1);
-                vst1q(inA, vecTmp0);
-                inA += 4;
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'
-                 */
-                vecTmp0 = vhsubq(vecSum0, vecSum1);
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
-                 */
-                vecW = vld1q(pW2);
-                pW2 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);
-
-                vst1q(inB, vecTmp1);
-                inB += 4;
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
-                 */
-                vecW = vld1q(pW1);
-                pW1 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);
-                vst1q(inC, vecTmp1);
-                inC += 4;
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
-                 */
-                vecW = vld1q(pW3);
-                pW3 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);
-                vst1q(inD, vecTmp1);
-                inD += 4;
-
-                vecA = vldrwq_s32(inA);
-                vecC = vldrwq_s32(inC);
-
-                blkCnt--;
-            }
-            pBase +=  CMPLX_DIM * n1;
-        }
-        n1 = n2;
-        n2 >>= 2u;
-        iter = iter << 2;
-        stage++;
-    }
-
-    /*
-     * End of 1st stages process
-     * data is in 11.21(q21) format for the 1024 point as there are 3 middle stages
-     * data is in 9.23(q23) format for the 256 point as there are 2 middle stages
-     * data is in 7.25(q25) format for the 64 point as there are 1 middle stage
-     * data is in 5.27(q27) format for the 16 point as there are no middle stages
-     */
-
-    /*
-     * start of Last stage process
-     */
-    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
-    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
-
-    /*
-     * load scheduling
-     */
-    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
-    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);
-
-    blkCnt = (fftLen >> 3);
-    while (blkCnt > 0U)
-    {
-        vecSum0 = vhaddq(vecA, vecC);
-        vecDiff0 = vhsubq(vecA, vecC);
-
-        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
-        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);
-
-        vecSum1 = vhaddq(vecB, vecD);
-        vecDiff1 = vhsubq(vecB, vecD);
-        /*
-         * pre-load for next iteration
-         */
-        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
-        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);
-
-        vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);
-
-        vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);
-
-        blkCnt--;
-    }
-
-    /*
-     * output is in 11.21(q21) format for the 1024 point
-     * output is in 9.23(q23) format for the 256 point
-     * output is in 7.25(q25) format for the 64 point
-     * output is in 5.27(q27) format for the 16 point
-     */
-}
-
-
-static void arm_cfft_radix4by2_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
-{
-    uint32_t     n2;
-    q31_t       *pIn0;
-    q31_t       *pIn1;
-    const q31_t *pCoef = S->pTwiddle;
-    uint32_t     blkCnt;
-    q31x4_t    vecIn0, vecIn1, vecSum, vecDiff;
-    q31x4_t    vecCmplxTmp, vecTw;
-
-    n2 = fftLen >> 1;
-    pIn0 = pSrc;
-    pIn1 = pSrc + fftLen;
-
-    blkCnt = n2 / 2;
-
-    while (blkCnt > 0U)
-    {
-        vecIn0 = vld1q_s32(pIn0);
-        vecIn1 = vld1q_s32(pIn1);
-
-        vecIn0 = vecIn0 >> 1;
-        vecIn1 = vecIn1 >> 1;
-        vecSum = vhaddq(vecIn0, vecIn1);
-        vst1q(pIn0, vecSum);
-        pIn0 += 4;
-
-        vecTw = vld1q_s32(pCoef);
-        pCoef += 4;
-        vecDiff = vhsubq(vecIn0, vecIn1);
-
-        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw, q31x4_t);
-        vst1q(pIn1, vecCmplxTmp);
-        pIn1 += 4;
-
-        blkCnt--;
-    }
-
-   _arm_radix4_butterfly_q31_mve(S, pSrc, n2);
-
-   _arm_radix4_butterfly_q31_mve(S, pSrc + fftLen, n2);
-
-    pIn0 = pSrc;
-    blkCnt = (fftLen << 1) >> 2;
-    while (blkCnt > 0U)
-    {
-        vecIn0 = vld1q_s32(pIn0);
-        vecIn0 = vecIn0 << 1;
-        vst1q(pIn0, vecIn0);
-        pIn0 += 4;
-        blkCnt--;
-    }
-    /*
-     * tail
-     * (will be merged thru tail predication)
-     */
-    blkCnt = (fftLen << 1) & 3;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
-
-        vecIn0 = vld1q_s32(pIn0);
-        vecIn0 = vecIn0 << 1;
-        vstrwq_p(pIn0, vecIn0, p0);
-    }
-
-}
-
-static void _arm_radix4_butterfly_inverse_q31_mve(
-    const arm_cfft_instance_q31 *S,
-    q31_t   *pSrc,
-    uint32_t fftLen)
-{
-    q31x4_t vecTmp0, vecTmp1;
-    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
-    q31x4_t vecA, vecB, vecC, vecD;
-    uint32_t  blkCnt;
-    uint32_t  n1, n2;
-    uint32_t  stage = 0;
-    int32_t  iter = 1;
-    static const int32_t strides[4] = {
-        (0 - 16) * (int32_t)sizeof(q31_t *), (1 - 16) * (int32_t)sizeof(q31_t *),
-        (8 - 16) * (int32_t)sizeof(q31_t *), (9 - 16) * (int32_t)sizeof(q31_t *)
-    };
-
-    /*
-     * Process first stages
-     * Each stage in middle stages provides two down scaling of the input
-     */
-    n2 = fftLen;
-    n1 = n2;
-    n2 >>= 2u;
-
-    for (int k = fftLen / 4u; k > 1; k >>= 2u)
-    {
-        q31_t const *p_rearranged_twiddle_tab_stride2 =
-            &S->rearranged_twiddle_stride2[
-            S->rearranged_twiddle_tab_stride2_arr[stage]];
-        q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
-            S->rearranged_twiddle_tab_stride3_arr[stage]];
-        q31_t const *p_rearranged_twiddle_tab_stride1 =
-            &S->rearranged_twiddle_stride1[
-            S->rearranged_twiddle_tab_stride1_arr[stage]];
-
-        q31_t * pBase = pSrc;
-        for (int i = 0; i < iter; i++)
-        {
-            q31_t    *inA = pBase;
-            q31_t    *inB = inA + n2 * CMPLX_DIM;
-            q31_t    *inC = inB + n2 * CMPLX_DIM;
-            q31_t    *inD = inC + n2 * CMPLX_DIM;
-            q31_t const *pW1 = p_rearranged_twiddle_tab_stride1;
-            q31_t const *pW2 = p_rearranged_twiddle_tab_stride2;
-            q31_t const *pW3 = p_rearranged_twiddle_tab_stride3;
-            q31x4_t    vecW;
-
-            blkCnt = n2 / 2;
-            /*
-             * load 2 x q31 complex pair
-             */
-            vecA = vldrwq_s32(inA);
-            vecC = vldrwq_s32(inC);
-            while (blkCnt > 0U)
-            {
-                vecB = vldrwq_s32(inB);
-                vecD = vldrwq_s32(inD);
-
-                vecSum0 = vhaddq(vecA, vecC);
-                vecDiff0 = vhsubq(vecA, vecC);
-
-                vecSum1 = vhaddq(vecB, vecD);
-                vecDiff1 = vhsubq(vecB, vecD);
-                /*
-                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
-                 */
-                vecTmp0 = vhaddq(vecSum0, vecSum1);
-                vst1q(inA, vecTmp0);
-                inA += 4;
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'
-                 */
-                vecTmp0 = vhsubq(vecSum0, vecSum1);
-                /*
-                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
-                 */
-                vecW = vld1q(pW2);
-                pW2 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);
-
-                vst1q(inB, vecTmp1);
-                inB += 4;
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
-                 */
-                vecW = vld1q(pW1);
-                pW1 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);
-                vst1q(inC, vecTmp1);
-                inC += 4;
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'
-                 */
-                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-                /*
-                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
-                 */
-                vecW = vld1q(pW3);
-                pW3 += 4;
-                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);
-                vst1q(inD, vecTmp1);
-                inD += 4;
-
-                vecA = vldrwq_s32(inA);
-                vecC = vldrwq_s32(inC);
-
-                blkCnt--;
-            }
-            pBase +=  CMPLX_DIM * n1;
-        }
-        n1 = n2;
-        n2 >>= 2u;
-        iter = iter << 2;
-        stage++;
-    }
-
-    /*
-     * End of 1st stages process
-     * data is in 11.21(q21) format for the 1024 point as there are 3 middle stages
-     * data is in 9.23(q23) format for the 256 point as there are 2 middle stages
-     * data is in 7.25(q25) format for the 64 point as there are 1 middle stage
-     * data is in 5.27(q27) format for the 16 point as there are no middle stages
-     */
-
-    /*
-     * start of Last stage process
-     */
-    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
-    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
-
-    /*
-     * load scheduling
-     */
-    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
-    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);
-
-    blkCnt = (fftLen >> 3);
-    while (blkCnt > 0U)
-    {
-        vecSum0 = vhaddq(vecA, vecC);
-        vecDiff0 = vhsubq(vecA, vecC);
-
-        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
-        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);
-
-        vecSum1 = vhaddq(vecB, vecD);
-        vecDiff1 = vhsubq(vecB, vecD);
-        /*
-         * pre-load for next iteration
-         */
-        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
-        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);
-
-        vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);
-
-        vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);
-
-        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);
-
-        blkCnt--;
-    }
-    /*
-     * output is in 11.21(q21) format for the 1024 point
-     * output is in 9.23(q23) format for the 256 point
-     * output is in 7.25(q25) format for the 64 point
-     * output is in 5.27(q27) format for the 16 point
-     */
-}
-
-static void arm_cfft_radix4by2_inverse_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
-{
-    uint32_t     n2;
-    q31_t       *pIn0;
-    q31_t       *pIn1;
-    const q31_t *pCoef = S->pTwiddle;
-
-    //uint16_t     twidCoefModifier = arm_cfft_radix2_twiddle_factor(S->fftLen);
-    //q31_t        twidIncr = (2 * twidCoefModifier * sizeof(q31_t));
-    uint32_t     blkCnt;
-    //uint64x2_t   vecOffs;
-    q31x4_t    vecIn0, vecIn1, vecSum, vecDiff;
-    q31x4_t    vecCmplxTmp, vecTw;
-
-    n2 = fftLen >> 1;
-
-    pIn0 = pSrc;
-    pIn1 = pSrc + fftLen;
-    //vecOffs[0] = 0;
-    //vecOffs[1] = (uint64_t) twidIncr;
-    blkCnt = n2 / 2;
-
-    while (blkCnt > 0U)
-    {
-        vecIn0 = vld1q_s32(pIn0);
-        vecIn1 = vld1q_s32(pIn1);
-
-        vecIn0 = vecIn0 >> 1;
-        vecIn1 = vecIn1 >> 1;
-        vecSum = vhaddq(vecIn0, vecIn1);
-        vst1q(pIn0, vecSum);
-        pIn0 += 4;
-
-        //vecTw = (q31x4_t) vldrdq_gather_offset_s64(pCoef, vecOffs);
-        vecTw = vld1q_s32(pCoef);
-        pCoef += 4;
-        vecDiff = vhsubq(vecIn0, vecIn1);
-
-        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxB(vecDiff, vecTw, q31x4_t);
-        vst1q(pIn1, vecCmplxTmp);
-        pIn1 += 4;
-
-        //vecOffs = vaddq((q31x4_t) vecOffs, 2 * twidIncr);
-        blkCnt--;
-    }
-
-    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, n2);
-
-    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc + fftLen, n2);
-
-    pIn0 = pSrc;
-    blkCnt = (fftLen << 1) >> 2;
-    while (blkCnt > 0U)
-    {
-        vecIn0 = vld1q_s32(pIn0);
-        vecIn0 = vecIn0 << 1;
-        vst1q(pIn0, vecIn0);
-        pIn0 += 4;
-        blkCnt--;
-    }
-    /*
-     * tail
-     * (will be merged thru tail predication)
-     */
-    blkCnt = (fftLen << 1) & 3;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
-
-        vecIn0 = vld1q_s32(pIn0);
-        vecIn0 = vecIn0 << 1;
-        vstrwq_p(pIn0, vecIn0, p0);
-    }
-
-}
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the Q31 complex FFT.
-  @param[in]     S               points to an instance of the fixed-point CFFT structure
-  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        none
- */
-void arm_cfft_q31(
-  const arm_cfft_instance_q31 * S,
-        q31_t * pSrc,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag)
-{
-        uint32_t fftLen = S->fftLen;
-
-        if (ifftFlag == 1U) {
-
-            switch (fftLen) {
-            case 16:
-            case 64:
-            case 256:
-            case 1024:
-            case 4096:
-                _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
-                break;
-
-            case 32:
-            case 128:
-            case 512:
-            case 2048:
-                arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
-                break;
-            }
-        } else {
-            switch (fftLen) {
-            case 16:
-            case 64:
-            case 256:
-            case 1024:
-            case 4096:
-                _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
-                break;
-
-            case 32:
-            case 128:
-            case 512:
-            case 2048:
-                arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
-                break;
-            }
-        }
-
-
-        if (bitReverseFlag)
-        {
-
-            arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-
-        }
-}
-#else
-
-extern void arm_radix4_butterfly_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint32_t twidCoefModifier);
-
-extern void arm_radix4_butterfly_inverse_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint32_t twidCoefModifier);
-
-extern void arm_bitreversal_32(
-        uint32_t * pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t * pBitRevTable);
-
-void arm_cfft_radix4by2_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef);
-
-void arm_cfft_radix4by2_inverse_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef);
-
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the Q31 complex FFT.
-  @param[in]     S               points to an instance of the fixed-point CFFT structure
-  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        none
- */
-void arm_cfft_q31(
-  const arm_cfft_instance_q31 * S,
-        q31_t * p1,
-        uint8_t ifftFlag,
-        uint8_t bitReverseFlag)
-{
-  uint32_t L = S->fftLen;
-
-  if (ifftFlag == 1U)
-  {
-     switch (L)
-     {
-     case 16:
-     case 64:
-     case 256:
-     case 1024:
-     case 4096:
-       arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
-       break;
-
-     case 32:
-     case 128:
-     case 512:
-     case 2048:
-       arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
-       break;
-     }
-  }
-  else
-  {
-     switch (L)
-     {
-     case 16:
-     case 64:
-     case 256:
-     case 1024:
-     case 4096:
-       arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
-       break;
-
-     case 32:
-     case 128:
-     case 512:
-     case 2048:
-       arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
-       break;
-     }
-  }
-
-  if ( bitReverseFlag )
-    arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
-}
-
-/**
-  @} end of ComplexFFT group
- */
-
-void arm_cfft_radix4by2_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef)
-{
-        uint32_t i, l;
-        uint32_t n2;
-        q31_t xt, yt, cosVal, sinVal;
-        q31_t p0, p1;
-
-  n2 = fftLen >> 1U;
-  for (i = 0; i < n2; i++)
-  {
-     cosVal = pCoef[2 * i];
-     sinVal = pCoef[2 * i + 1];
-
-     l = i + n2;
-
-     xt =          (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
-     pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);
-
-     yt =              (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
-     pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);
-
-     mult_32x32_keep32_R(p0, xt, cosVal);
-     mult_32x32_keep32_R(p1, yt, cosVal);
-     multAcc_32x32_keep32_R(p0, yt, sinVal);
-     multSub_32x32_keep32_R(p1, xt, sinVal);
-
-     pSrc[2 * l]     = p0 << 1;
-     pSrc[2 * l + 1] = p1 << 1;
-  }
-
-
-  /* first col */
-  arm_radix4_butterfly_q31 (pSrc,          n2, (q31_t*)pCoef, 2U);
-
-  /* second col */
-  arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U);
-
-  n2 = fftLen >> 1U;
-  for (i = 0; i < n2; i++)
-  {
-     p0 = pSrc[4 * i + 0];
-     p1 = pSrc[4 * i + 1];
-     xt = pSrc[4 * i + 2];
-     yt = pSrc[4 * i + 3];
-
-     p0 <<= 1U;
-     p1 <<= 1U;
-     xt <<= 1U;
-     yt <<= 1U;
-
-     pSrc[4 * i + 0] = p0;
-     pSrc[4 * i + 1] = p1;
-     pSrc[4 * i + 2] = xt;
-     pSrc[4 * i + 3] = yt;
-  }
-
-}
-
-void arm_cfft_radix4by2_inverse_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef)
-{
-  uint32_t i, l;
-  uint32_t n2;
-  q31_t xt, yt, cosVal, sinVal;
-  q31_t p0, p1;
-
-  n2 = fftLen >> 1U;
-  for (i = 0; i < n2; i++)
-  {
-     cosVal = pCoef[2 * i];
-     sinVal = pCoef[2 * i + 1];
-
-     l = i + n2;
-
-     xt =          (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
-     pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);
-
-     yt =              (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
-     pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);
-
-     mult_32x32_keep32_R(p0, xt, cosVal);
-     mult_32x32_keep32_R(p1, yt, cosVal);
-     multSub_32x32_keep32_R(p0, yt, sinVal);
-     multAcc_32x32_keep32_R(p1, xt, sinVal);
-
-     pSrc[2 * l]     = p0 << 1U;
-     pSrc[2 * l + 1] = p1 << 1U;
-  }
-
-  /* first col */
-  arm_radix4_butterfly_inverse_q31( pSrc,          n2, (q31_t*)pCoef, 2U);
-
-  /* second col */
-  arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);
-
-  n2 = fftLen >> 1U;
-  for (i = 0; i < n2; i++)
-  {
-     p0 = pSrc[4 * i + 0];
-     p1 = pSrc[4 * i + 1];
-     xt = pSrc[4 * i + 2];
-     yt = pSrc[4 * i + 3];
-
-     p0 <<= 1U;
-     p1 <<= 1U;
-     xt <<= 1U;
-     yt <<= 1U;
-
-     pSrc[4 * i + 0] = p0;
-     pSrc[4 * i + 1] = p1;
-     pSrc[4 * i + 2] = xt;
-     pSrc[4 * i + 3] = yt;
-  }
-}
-#endif /* defined(ARM_MATH_MVEI) */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_q31.c
+ * Description:  Combined Radix Decimation in Frequency CFFT fixed point processing function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+extern void arm_radix4_butterfly_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint32_t twidCoefModifier);
+
+extern void arm_radix4_butterfly_inverse_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint32_t twidCoefModifier);
+
+extern void arm_bitreversal_32(
+        uint32_t * pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t * pBitRevTable);
+
+void arm_cfft_radix4by2_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef);
+
+void arm_cfft_radix4by2_inverse_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q31 complex FFT.
+  @param[in]     S               points to an instance of the fixed-point CFFT structure; its fftLen, pTwiddle, bitRevLength and pBitRevTable are read
+  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform (every value other than 1 takes this path)
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value != 0: enables bit reversal of output
+  @return        none. For a fftLen not in {16, 32, 64, 128, 256, 512, 1024, 2048, 4096} no butterfly is run; bit reversal is still applied when enabled
+ */
+
+void arm_cfft_q31(
+  const arm_cfft_instance_q31 * S,
+        q31_t * p1,
+        uint8_t ifftFlag,
+        uint8_t bitReverseFlag)
+{
+  uint32_t L = S->fftLen;   /* FFT length taken from the instance; selects the butterfly routine below */
+
+  if (ifftFlag == 1U)   /* inverse transform */
+  {
+     switch (L)
+     {
+     case 16:   /* powers of 4: a single radix-4 butterfly call covers the whole transform */
+     case 64:
+     case 256:
+     case 1024:
+     case 4096:
+       arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );   /* NOTE(review): cast looks unnecessary, the callee is declared with const q31_t * */
+       break;
+
+     case 32:   /* remaining supported lengths: handled by the radix4by2 helper defined later in this file */
+     case 128:
+     case 512:
+     case 2048:
+       arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
+       break;
+     }   /* no default case: any other length leaves the buffer untransformed */
+  }
+  else   /* forward transform */
+  {
+     switch (L)
+     {
+     case 16:   /* powers of 4: a single radix-4 butterfly call covers the whole transform */
+     case 64:
+     case 256:
+     case 1024:
+     case 4096:
+       arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );   /* NOTE(review): cast looks unnecessary, the callee is declared with const q31_t * */
+       break;
+
+     case 32:   /* remaining supported lengths: handled by the radix4by2 helper defined later in this file */
+     case 128:
+     case 512:
+     case 2048:
+       arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
+       break;
+     }   /* no default case: any other length leaves the buffer untransformed */
+  }
+
+  if ( bitReverseFlag )   /* any non-zero value enables the reordering pass */
+    arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
+/* Forward Q31 CFFT helper: one radix-2 butterfly pass over pSrc, then a radix-4 FFT on each half. arm_cfft_q31 calls it for fftLen 32, 128, 512 and 2048. */
+void arm_cfft_radix4by2_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef)
+{
+        uint32_t i, l;
+        uint32_t n2;
+        q31_t xt, yt, cosVal, sinVal;
+        q31_t p0, p1;
+
+  n2 = fftLen >> 1U;   /* half length: distance (in complex samples) between the two butterfly inputs */
+  for (i = 0; i < n2; i++)
+  {
+     cosVal = pCoef[2 * i];       /* twiddle table holds interleaved pairs; even word goes to cosVal */
+     sinVal = pCoef[2 * i + 1];   /* odd word goes to sinVal */
+
+     l = i + n2;   /* partner sample in the upper half of the buffer */
+
+     xt =          (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);   /* both inputs are scaled by 1/4 before add/subtract */
+     pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);
+
+     yt =              (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
+     pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);
+     /* Macros below are defined outside this file; presumably p0 = xt*cos + yt*sin and p1 = yt*cos - xt*sin, keeping the high 32 bits (TODO confirm in arm_math.h) */
+     mult_32x32_keep32_R(p0, xt, cosVal);
+     mult_32x32_keep32_R(p1, yt, cosVal);
+     multAcc_32x32_keep32_R(p0, yt, sinVal);
+     multSub_32x32_keep32_R(p1, xt, sinVal);
+
+     pSrc[2 * l]     = p0 << 1;   /* NOTE(review): left shift of a (presumably signed) q31_t; undefined in ISO C for negative values, confirm toolchain behavior */
+     pSrc[2 * l + 1] = p1 << 1;
+  }
+
+  /* first col */
+  arm_radix4_butterfly_q31 (pSrc,          n2, (q31_t*)pCoef, 2U);   /* lower half, n2 points, twiddle modifier 2 */
+
+  /* second col */
+  arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U);   /* upper half starts fftLen words (= n2 complex samples) further on */
+
+  n2 = fftLen >> 1U;   /* same value as before; n2 is not modified in between */
+  for (i = 0; i < n2; i++)   /* 4 words per iteration * fftLen/2 iterations = every word of the 2*fftLen buffer */
+  {
+     p0 = pSrc[4 * i + 0];
+     p1 = pSrc[4 * i + 1];
+     xt = pSrc[4 * i + 2];
+     yt = pSrc[4 * i + 3];
+
+     p0 <<= 1U;   /* double each word; same signed-shift caveat as noted in the first loop */
+     p1 <<= 1U;
+     xt <<= 1U;
+     yt <<= 1U;
+
+     pSrc[4 * i + 0] = p0;
+     pSrc[4 * i + 1] = p1;
+     pSrc[4 * i + 2] = xt;
+     pSrc[4 * i + 3] = yt;
+  }
+
+}
+/* Inverse Q31 CFFT helper: one radix-2 butterfly pass over pSrc, then an inverse radix-4 FFT on each half. arm_cfft_q31 calls it for fftLen 32, 128, 512 and 2048. */
+void arm_cfft_radix4by2_inverse_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef)
+{
+  uint32_t i, l;
+  uint32_t n2;
+  q31_t xt, yt, cosVal, sinVal;
+  q31_t p0, p1;
+
+  n2 = fftLen >> 1U;   /* half length: distance (in complex samples) between the two butterfly inputs */
+  for (i = 0; i < n2; i++)
+  {
+     cosVal = pCoef[2 * i];       /* twiddle table holds interleaved pairs; even word goes to cosVal */
+     sinVal = pCoef[2 * i + 1];   /* odd word goes to sinVal */
+
+     l = i + n2;   /* partner sample in the upper half of the buffer */
+
+     xt =          (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);   /* both inputs are scaled by 1/4 before add/subtract */
+     pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);
+
+     yt =              (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
+     pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);
+     /* Sub/Acc are swapped relative to the forward helper; presumably p0 = xt*cos - yt*sin and p1 = yt*cos + xt*sin (macros defined outside this file, TODO confirm) */
+     mult_32x32_keep32_R(p0, xt, cosVal);
+     mult_32x32_keep32_R(p1, yt, cosVal);
+     multSub_32x32_keep32_R(p0, yt, sinVal);
+     multAcc_32x32_keep32_R(p1, xt, sinVal);
+
+     pSrc[2 * l]     = p0 << 1U;   /* NOTE(review): left shift of a (presumably signed) q31_t; undefined in ISO C for negative values, confirm toolchain behavior */
+     pSrc[2 * l + 1] = p1 << 1U;
+  }
+
+  /* first col */
+  arm_radix4_butterfly_inverse_q31( pSrc,          n2, (q31_t*)pCoef, 2U);   /* lower half, n2 points, twiddle modifier 2 */
+
+  /* second col */
+  arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);   /* upper half starts fftLen words (= n2 complex samples) further on */
+
+  n2 = fftLen >> 1U;   /* same value as before; n2 is not modified in between */
+  for (i = 0; i < n2; i++)   /* 4 words per iteration * fftLen/2 iterations = every word of the 2*fftLen buffer */
+  {
+     p0 = pSrc[4 * i + 0];
+     p1 = pSrc[4 * i + 1];
+     xt = pSrc[4 * i + 2];
+     yt = pSrc[4 * i + 3];
+
+     p0 <<= 1U;   /* double each word; same signed-shift caveat as noted in the first loop */
+     p1 <<= 1U;
+     xt <<= 1U;
+     yt <<= 1U;
+
+     pSrc[4 * i + 0] = p0;
+     pSrc[4 * i + 1] = p1;
+     pSrc[4 * i + 2] = xt;
+     pSrc[4 * i + 3] = yt;
+  }
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
index ab218a5..c514fda 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
@@ -1,470 +1,470 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix2_f32.c
- * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-void arm_radix2_butterfly_f32(
-        float32_t * pSrc,
-        uint32_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier);
-
-void arm_radix2_butterfly_inverse_f32(
-        float32_t * pSrc,
-        uint32_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier,
-        float32_t onebyfftLen);
-
-extern void arm_bitreversal_f32(
-        float32_t * pSrc,
-        uint16_t fftSize,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab);
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Radix-2 CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future
-  @param[in]     S    points to an instance of the floating-point Radix-2 CFFT/CIFFT structure
-  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @return        none
- */
-
-void arm_cfft_radix2_f32(
-const arm_cfft_radix2_instance_f32 * S,
-      float32_t * pSrc)
-{
-
-   if (S->ifftFlag == 1U)
-   {
-      /* Complex IFFT radix-2 */
-      arm_radix2_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle,
-      S->twidCoefModifier, S->onebyfftLen);
-   }
-   else
-   {
-      /* Complex FFT radix-2 */
-      arm_radix2_butterfly_f32(pSrc, S->fftLen, S->pTwiddle,
-      S->twidCoefModifier);
-   }
-
-   if (S->bitReverseFlag == 1U)
-   {
-      /* Bit Reversal */
-      arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
-   }
-
-}
-
-
-/**
-  @} end of ComplexFFT group
- */
-
-
-
-/* ----------------------------------------------------------------------
- ** Internal helper function used by the FFTs
- ** ------------------------------------------------------------------- */
-
-/**
-  brief  Core function for the floating-point CFFT butterfly process.
-  param[in,out] pSrc             points to in-place buffer of floating-point data type
-  param[in]     fftLen           length of the FFT
-  param[in]     pCoef            points to twiddle coefficient buffer
-  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  return        none
- */
-
-void arm_radix2_butterfly_f32(
-        float32_t * pSrc,
-        uint32_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier)
-{
-
-        uint32_t i, j, k, l;
-        uint32_t n1, n2, ia;
-        float32_t xt, yt, cosVal, sinVal;
-        float32_t p0, p1, p2, p3;
-        float32_t a0, a1;
-
-#if defined (ARM_MATH_DSP)
-
-   /*  Initializations for the first stage */
-   n2 = fftLen >> 1;
-   ia = 0;
-   i = 0;
-
-   // loop for groups
-   for (k = n2; k > 0; k--)
-   {
-      cosVal = pCoef[ia * 2];
-      sinVal = pCoef[(ia * 2) + 1];
-
-      /*  Twiddle coefficients index modifier */
-      ia += twidCoefModifier;
-
-      /*  index calculation for the input as, */
-      /*  pSrc[i + 0], pSrc[i + fftLen/1] */
-      l = i + n2;
-
-      /*  Butterfly implementation */
-      a0 = pSrc[2 * i] + pSrc[2 * l];
-      xt = pSrc[2 * i] - pSrc[2 * l];
-
-      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
-
-      p0 = xt * cosVal;
-      p1 = yt * sinVal;
-      p2 = yt * cosVal;
-      p3 = xt * sinVal;
-
-      pSrc[2 * i]     = a0;
-      pSrc[2 * i + 1] = a1;
-
-      pSrc[2 * l]     = p0 + p1;
-      pSrc[2 * l + 1] = p2 - p3;
-
-      i++;
-   }                             // groups loop end
-
-   twidCoefModifier <<= 1U;
-
-   // loop for stage
-   for (k = n2; k > 2; k = k >> 1)
-   {
-      n1 = n2;
-      n2 = n2 >> 1;
-      ia = 0;
-
-      // loop for groups
-      j = 0;
-      do
-      {
-         cosVal = pCoef[ia * 2];
-         sinVal = pCoef[(ia * 2) + 1];
-         ia += twidCoefModifier;
-
-         // loop for butterfly
-         i = j;
-         do
-         {
-            l = i + n2;
-            a0 = pSrc[2 * i] + pSrc[2 * l];
-            xt = pSrc[2 * i] - pSrc[2 * l];
-
-            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
-
-            p0 = xt * cosVal;
-            p1 = yt * sinVal;
-            p2 = yt * cosVal;
-            p3 = xt * sinVal;
-
-            pSrc[2 * i] = a0;
-            pSrc[2 * i + 1] = a1;
-
-            pSrc[2 * l]     = p0 + p1;
-            pSrc[2 * l + 1] = p2 - p3;
-
-            i += n1;
-         } while ( i < fftLen );                        // butterfly loop end
-         j++;
-      } while ( j < n2);                          // groups loop end
-      twidCoefModifier <<= 1U;
-   }                             // stages loop end
-
-   // loop for butterfly
-   for (i = 0; i < fftLen; i += 2)
-   {
-      a0 = pSrc[2 * i] + pSrc[2 * i + 2];
-      xt = pSrc[2 * i] - pSrc[2 * i + 2];
-
-      yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];
-      a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];
-
-      pSrc[2 * i] = a0;
-      pSrc[2 * i + 1] = a1;
-      pSrc[2 * i + 2] = xt;
-      pSrc[2 * i + 3] = yt;
-   }                             // groups loop end
-
-#else /* #if defined (ARM_MATH_DSP) */
-
-   n2 = fftLen;
-
-   // loop for stage
-   for (k = fftLen; k > 1; k = k >> 1)
-   {
-      n1 = n2;
-      n2 = n2 >> 1;
-      ia = 0;
-
-      // loop for groups
-      j = 0;
-      do
-      {
-         cosVal = pCoef[ia * 2];
-         sinVal = pCoef[(ia * 2) + 1];
-         ia += twidCoefModifier;
-
-         // loop for butterfly
-         i = j;
-         do
-         {
-            l = i + n2;
-            a0 = pSrc[2 * i] + pSrc[2 * l];
-            xt = pSrc[2 * i] - pSrc[2 * l];
-
-            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
-
-            p0 = xt * cosVal;
-            p1 = yt * sinVal;
-            p2 = yt * cosVal;
-            p3 = xt * sinVal;
-
-            pSrc[2 * i] = a0;
-            pSrc[2 * i + 1] = a1;
-
-            pSrc[2 * l]     = p0 + p1;
-            pSrc[2 * l + 1] = p2 - p3;
-
-            i += n1;
-         } while (i < fftLen);
-         j++;
-      } while (j < n2);
-      twidCoefModifier <<= 1U;
-   }
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-}
-
-
-void arm_radix2_butterfly_inverse_f32(
-        float32_t * pSrc,
-        uint32_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier,
-        float32_t onebyfftLen)
-{
-
-        uint32_t i, j, k, l;
-        uint32_t n1, n2, ia;
-        float32_t xt, yt, cosVal, sinVal;
-        float32_t p0, p1, p2, p3;
-        float32_t a0, a1;
-
-#if defined (ARM_MATH_DSP)
-
-   n2 = fftLen >> 1;
-   ia = 0;
-
-   // loop for groups
-   for (i = 0; i < n2; i++)
-   {
-      cosVal = pCoef[ia * 2];
-      sinVal = pCoef[(ia * 2) + 1];
-      ia += twidCoefModifier;
-
-      l = i + n2;
-      a0 = pSrc[2 * i] + pSrc[2 * l];
-      xt = pSrc[2 * i] - pSrc[2 * l];
-
-      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
-
-      p0 = xt * cosVal;
-      p1 = yt * sinVal;
-      p2 = yt * cosVal;
-      p3 = xt * sinVal;
-
-      pSrc[2 * i] = a0;
-      pSrc[2 * i + 1] = a1;
-
-      pSrc[2 * l]     = p0 - p1;
-      pSrc[2 * l + 1] = p2 + p3;
-   }                             // groups loop end
-
-   twidCoefModifier <<= 1U;
-
-   // loop for stage
-   for (k = fftLen / 2; k > 2; k = k >> 1)
-   {
-      n1 = n2;
-      n2 = n2 >> 1;
-      ia = 0;
-
-      // loop for groups
-      j = 0;
-      do
-      {
-         cosVal = pCoef[ia * 2];
-         sinVal = pCoef[(ia * 2) + 1];
-         ia += twidCoefModifier;
-
-         // loop for butterfly
-         i = j;
-         do
-         {
-            l = i + n2;
-            a0 = pSrc[2 * i] + pSrc[2 * l];
-            xt = pSrc[2 * i] - pSrc[2 * l];
-
-            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
-
-            p0 = xt * cosVal;
-            p1 = yt * sinVal;
-            p2 = yt * cosVal;
-            p3 = xt * sinVal;
-
-            pSrc[2 * i] = a0;
-            pSrc[2 * i + 1] = a1;
-
-            pSrc[2 * l]     = p0 - p1;
-            pSrc[2 * l + 1] = p2 + p3;
-
-            i += n1;
-         } while ( i < fftLen );                 // butterfly loop end
-         j++;
-      } while (j < n2);                      // groups loop end
-
-      twidCoefModifier <<= 1U;
-   }                             // stages loop end
-
-   // loop for butterfly
-   for (i = 0; i < fftLen; i += 2)
-   {
-      a0 = pSrc[2 * i] + pSrc[2 * i + 2];
-      xt = pSrc[2 * i] - pSrc[2 * i + 2];
-
-      a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];
-      yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];
-
-      p0 = a0 * onebyfftLen;
-      p2 = xt * onebyfftLen;
-      p1 = a1 * onebyfftLen;
-      p3 = yt * onebyfftLen;
-
-      pSrc[2 * i] = p0;
-      pSrc[2 * i + 1] = p1;
-      pSrc[2 * i + 2] = p2;
-      pSrc[2 * i + 3] = p3;
-   }                             // butterfly loop end
-
-#else /* #if defined (ARM_MATH_DSP) */
-
-   n2 = fftLen;
-
-   // loop for stage
-   for (k = fftLen; k > 2; k = k >> 1)
-   {
-      n1 = n2;
-      n2 = n2 >> 1;
-      ia = 0;
-
-      // loop for groups
-      j = 0;
-      do
-      {
-         cosVal = pCoef[ia * 2];
-         sinVal = pCoef[(ia * 2) + 1];
-         ia = ia + twidCoefModifier;
-
-         // loop for butterfly
-         i = j;
-         do
-         {
-            l = i + n2;
-            a0 = pSrc[2 * i] + pSrc[2 * l];
-            xt = pSrc[2 * i] - pSrc[2 * l];
-
-            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
-
-            p0 = xt * cosVal;
-            p1 = yt * sinVal;
-            p2 = yt * cosVal;
-            p3 = xt * sinVal;
-
-            pSrc[2 * i] = a0;
-            pSrc[2 * i + 1] = a1;
-
-            pSrc[2 * l]     = p0 - p1;
-            pSrc[2 * l + 1] = p2 + p3;
-
-            i += n1;
-         } while ( i < fftLen );                    // butterfly loop end
-         j++;
-      } while ( j < n2 );                      // groups loop end
-
-      twidCoefModifier = twidCoefModifier << 1U;
-   }                             // stages loop end
-
-   n1 = n2;
-   n2 = n2 >> 1;
-
-   // loop for butterfly
-   for (i = 0; i < fftLen; i += n1)
-   {
-      l = i + n2;
-
-      a0 = pSrc[2 * i] + pSrc[2 * l];
-      xt = pSrc[2 * i] - pSrc[2 * l];
-
-      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
-      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-
-      p0 = a0 * onebyfftLen;
-      p2 = xt * onebyfftLen;
-      p1 = a1 * onebyfftLen;
-      p3 = yt * onebyfftLen;
-
-      pSrc[2 * i] = p0;
-      pSrc[2 * l] = p2;
-
-      pSrc[2 * i + 1] = p1;
-      pSrc[2 * l + 1] = p3;
-   }                             // butterfly loop end
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix2_f32.c
+ * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+void arm_radix2_butterfly_f32(
+        float32_t * pSrc,
+        uint32_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier);
+
+void arm_radix2_butterfly_inverse_f32(
+        float32_t * pSrc,
+        uint32_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier,
+        float32_t onebyfftLen);
+
+extern void arm_bitreversal_f32(
+        float32_t * pSrc,
+        uint16_t fftSize,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Radix-2 CFFT/CIFFT. Direction and bit reversal are selected by the ifftFlag and bitReverseFlag fields of the instance.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future
+  @param[in]     S    points to an instance of the floating-point Radix-2 CFFT/CIFFT structure (fftLen, pTwiddle, twidCoefModifier, onebyfftLen, bitRevFactor and pBitRevTable are read as well)
+  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @return        none
+ */
+
+void arm_cfft_radix2_f32(
+const arm_cfft_radix2_instance_f32 * S,
+      float32_t * pSrc)
+{
+
+   if (S->ifftFlag == 1U)   /* only the exact value 1 selects the inverse transform */
+   {
+      /* Complex IFFT radix-2 */
+      arm_radix2_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle,
+      S->twidCoefModifier, S->onebyfftLen);
+   }
+   else
+   {
+      /* Complex FFT radix-2 */
+      arm_radix2_butterfly_f32(pSrc, S->fftLen, S->pTwiddle,
+      S->twidCoefModifier);
+   }
+
+   if (S->bitReverseFlag == 1U)   /* only the exact value 1 enables the reordering pass */
+   {
+      /* Bit Reversal */
+      arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+   }
+
+}
+
+
+/**
+  @} end of ComplexFFT group
+ */
+
+
+
+/* ----------------------------------------------------------------------
+ ** Internal helper function used by the FFTs
+ ** ------------------------------------------------------------------- */
+
+/**
+  brief  Core function for the forward floating-point radix-2 CFFT butterfly process.
+  param[in,out] pSrc             points to in-place buffer of floating-point data type, indexed as interleaved pairs (2*i, 2*i+1) per complex sample
+  param[in]     fftLen           length of the FFT
+  param[in]     pCoef            points to twiddle coefficient buffer, read as interleaved (cos, sin) pairs
+  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table; doubled after every stage
+  return        none
+ */
+
+void arm_radix2_butterfly_f32(
+        float32_t * pSrc,
+        uint32_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier)
+{
+
+        uint32_t i, j, k, l;
+        uint32_t n1, n2, ia;
+        float32_t xt, yt, cosVal, sinVal;
+        float32_t p0, p1, p2, p3;
+        float32_t a0, a1;
+
+#if defined (ARM_MATH_DSP)
+   /* This variant handles the first stage, the middle stages and the last stage in three separate loops */
+   /*  Initializations for the first stage */
+   n2 = fftLen >> 1;
+   ia = 0;
+   i = 0;
+
+   // loop for groups
+   for (k = n2; k > 0; k--)
+   {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+
+      /*  Twiddle coefficients index modifier */
+      ia += twidCoefModifier;
+
+      /*  index calculation for the input as, */
+      /*  pSrc[i + 0], pSrc[i + fftLen/2] */
+      l = i + n2;
+
+      /*  Butterfly implementation */
+      a0 = pSrc[2 * i] + pSrc[2 * l];
+      xt = pSrc[2 * i] - pSrc[2 * l];
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+      p0 = xt * cosVal;
+      p1 = yt * sinVal;
+      p2 = yt * cosVal;
+      p3 = xt * sinVal;
+
+      pSrc[2 * i]     = a0;   /* sum goes back to the lower index */
+      pSrc[2 * i + 1] = a1;
+
+      pSrc[2 * l]     = p0 + p1;   /* difference rotated by the twiddle goes to the upper index */
+      pSrc[2 * l + 1] = p2 - p3;
+
+      i++;
+   }                             // groups loop end
+
+   twidCoefModifier <<= 1U;   /* twiddle stride doubles for the next stage */
+
+   // loop for stage
+   for (k = n2; k > 2; k = k >> 1)
+   {
+      n1 = n2;        /* spacing between successive butterflies that share a twiddle */
+      n2 = n2 >> 1;   /* distance between the two inputs of one butterfly */
+      ia = 0;
+
+      // loop for groups
+      j = 0;
+      do
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia += twidCoefModifier;
+
+         // loop for butterfly
+         i = j;
+         do
+         {
+            l = i + n2;
+            a0 = pSrc[2 * i] + pSrc[2 * l];
+            xt = pSrc[2 * i] - pSrc[2 * l];
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+            p0 = xt * cosVal;
+            p1 = yt * sinVal;
+            p2 = yt * cosVal;
+            p3 = xt * sinVal;
+
+            pSrc[2 * i] = a0;
+            pSrc[2 * i + 1] = a1;
+
+            pSrc[2 * l]     = p0 + p1;
+            pSrc[2 * l + 1] = p2 - p3;
+
+            i += n1;
+         } while ( i < fftLen );                        // butterfly loop end
+         j++;
+      } while ( j < n2);                          // groups loop end
+      twidCoefModifier <<= 1U;
+   }                             // stages loop end
+
+   // loop for butterfly (last stage: adjacent complex samples, no twiddle multiplication)
+   for (i = 0; i < fftLen; i += 2)
+   {
+      a0 = pSrc[2 * i] + pSrc[2 * i + 2];
+      xt = pSrc[2 * i] - pSrc[2 * i + 2];
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];
+      a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];
+
+      pSrc[2 * i] = a0;
+      pSrc[2 * i + 1] = a1;
+      pSrc[2 * i + 2] = xt;
+      pSrc[2 * i + 3] = yt;
+   }                             // butterfly loop end
+
+#else /* #if defined (ARM_MATH_DSP) */
+   /* This variant runs every stage through one generic triple loop */
+   n2 = fftLen;
+
+   // loop for stage
+   for (k = fftLen; k > 1; k = k >> 1)
+   {
+      n1 = n2;        /* spacing between successive butterflies that share a twiddle */
+      n2 = n2 >> 1;   /* distance between the two inputs of one butterfly */
+      ia = 0;
+
+      // loop for groups
+      j = 0;
+      do
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia += twidCoefModifier;
+
+         // loop for butterfly
+         i = j;
+         do
+         {
+            l = i + n2;
+            a0 = pSrc[2 * i] + pSrc[2 * l];
+            xt = pSrc[2 * i] - pSrc[2 * l];
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+            p0 = xt * cosVal;
+            p1 = yt * sinVal;
+            p2 = yt * cosVal;
+            p3 = xt * sinVal;
+
+            pSrc[2 * i] = a0;
+            pSrc[2 * i + 1] = a1;
+
+            pSrc[2 * l]     = p0 + p1;
+            pSrc[2 * l + 1] = p2 - p3;
+
+            i += n1;
+         } while (i < fftLen);
+         j++;
+      } while (j < n2);
+      twidCoefModifier <<= 1U;   /* twiddle stride doubles for the next stage */
+   }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+
+void arm_radix2_butterfly_inverse_f32(
+        float32_t * pSrc,              /* in/out: interleaved (re, im) samples, overwritten in place */
+        uint32_t fftLen,               /* number of complex points held in pSrc */
+  const float32_t * pCoef,             /* twiddle table read as (cosVal, sinVal) pairs */
+        uint16_t twidCoefModifier,     /* initial step through pCoef, in pairs; doubled after each stage */
+        float32_t onebyfftLen)         /* scale factor applied to the outputs of the last pass */
+{
+   /* Radix-2 butterfly passes for the inverse transform; difference terms are rotated by (cosVal + j*sinVal). */
+        uint32_t i, j, k, l;
+        uint32_t n1, n2, ia;
+        float32_t xt, yt, cosVal, sinVal;
+        float32_t p0, p1, p2, p3;
+        float32_t a0, a1;
+
+#if defined (ARM_MATH_DSP)
+   /* First stage is peeled out of the stage loop: every butterfly uses its own twiddle pair. */
+   n2 = fftLen >> 1;
+   ia = 0;
+
+   // loop for groups
+   for (i = 0; i < n2; i++)
+   {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+      ia += twidCoefModifier;
+
+      l = i + n2;
+      a0 = pSrc[2 * i] + pSrc[2 * l];
+      xt = pSrc[2 * i] - pSrc[2 * l];
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+      p0 = xt * cosVal;
+      p1 = yt * sinVal;
+      p2 = yt * cosVal;
+      p3 = xt * sinVal;
+
+      pSrc[2 * i] = a0;
+      pSrc[2 * i + 1] = a1;
+
+      pSrc[2 * l]     = p0 - p1;   // real part of (xt + j*yt) * (cosVal + j*sinVal)
+      pSrc[2 * l + 1] = p2 + p3;   // imaginary part of the same product
+   }                             // groups loop end
+
+   twidCoefModifier <<= 1U;
+
+   // loop for stage (middle stages only; the first and last stages are handled outside this loop)
+   for (k = fftLen / 2; k > 2; k = k >> 1)
+   {
+      n1 = n2;
+      n2 = n2 >> 1;
+      ia = 0;
+
+      // loop for groups
+      j = 0;
+      do
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia += twidCoefModifier;
+
+         // loop for butterfly
+         i = j;
+         do
+         {
+            l = i + n2;
+            a0 = pSrc[2 * i] + pSrc[2 * l];
+            xt = pSrc[2 * i] - pSrc[2 * l];
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+            p0 = xt * cosVal;
+            p1 = yt * sinVal;
+            p2 = yt * cosVal;
+            p3 = xt * sinVal;
+
+            pSrc[2 * i] = a0;
+            pSrc[2 * i + 1] = a1;
+
+            pSrc[2 * l]     = p0 - p1;
+            pSrc[2 * l + 1] = p2 + p3;
+
+            i += n1;
+         } while ( i < fftLen );                 // butterfly loop end
+         j++;
+      } while (j < n2);                      // groups loop end
+
+      twidCoefModifier <<= 1U;
+   }                             // stages loop end
+
+   // last stage: twiddle-free butterflies on adjacent points, scaled by onebyfftLen
+   for (i = 0; i < fftLen; i += 2)
+   {
+      a0 = pSrc[2 * i] + pSrc[2 * i + 2];
+      xt = pSrc[2 * i] - pSrc[2 * i + 2];
+
+      a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];
+      yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];
+
+      p0 = a0 * onebyfftLen;
+      p2 = xt * onebyfftLen;
+      p1 = a1 * onebyfftLen;
+      p3 = yt * onebyfftLen;
+
+      pSrc[2 * i] = p0;
+      pSrc[2 * i + 1] = p1;
+      pSrc[2 * i + 2] = p2;
+      pSrc[2 * i + 3] = p3;
+   }                             // butterfly loop end
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+   n2 = fftLen;
+
+   // loop for stage (every stage except the last, which is handled after this loop)
+   for (k = fftLen; k > 2; k = k >> 1)
+   {
+      n1 = n2;
+      n2 = n2 >> 1;
+      ia = 0;
+
+      // loop for groups
+      j = 0;
+      do
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia = ia + twidCoefModifier;
+
+         // loop for butterfly
+         i = j;
+         do
+         {
+            l = i + n2;
+            a0 = pSrc[2 * i] + pSrc[2 * l];
+            xt = pSrc[2 * i] - pSrc[2 * l];
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+            p0 = xt * cosVal;
+            p1 = yt * sinVal;
+            p2 = yt * cosVal;
+            p3 = xt * sinVal;
+
+            pSrc[2 * i] = a0;
+            pSrc[2 * i + 1] = a1;
+
+            pSrc[2 * l]     = p0 - p1;   // real part of (xt + j*yt) * (cosVal + j*sinVal)
+            pSrc[2 * l + 1] = p2 + p3;   // imaginary part of the same product
+
+            i += n1;
+         } while ( i < fftLen );                    // butterfly loop end
+         j++;
+      } while ( j < n2 );                      // groups loop end
+
+      twidCoefModifier = twidCoefModifier << 1U;
+   }                             // stages loop end
+
+   n1 = n2;
+   n2 = n2 >> 1;
+
+   // last stage: twiddle-free butterflies, scaled by onebyfftLen
+   for (i = 0; i < fftLen; i += n1)
+   {
+      l = i + n2;
+
+      a0 = pSrc[2 * i] + pSrc[2 * l];
+      xt = pSrc[2 * i] - pSrc[2 * l];
+
+      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+
+      p0 = a0 * onebyfftLen;
+      p2 = xt * onebyfftLen;
+      p1 = a1 * onebyfftLen;
+      p3 = yt * onebyfftLen;
+
+      pSrc[2 * i] = p0;
+      pSrc[2 * l] = p2;
+
+      pSrc[2 * i + 1] = p1;
+      pSrc[2 * l + 1] = p3;
+   }                             // butterfly loop end
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
index ae9f29a..4a11840 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
@@ -1,209 +1,197 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix2_init_f32.c
- * Description:  Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Initialization function for the floating-point CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
-  @param[in,out] S              points to an instance of the floating-point CFFT/CIFFT structure
-  @param[in]     fftLen         length of the FFT
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
-
-  @par           Details
-                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-  @par
-                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
-*/
-
-arm_status arm_cfft_radix2_init_f32(
-  arm_cfft_radix2_instance_f32 * S,
-  uint16_t fftLen,
-  uint8_t ifftFlag,
-  uint8_t bitReverseFlag)
-{
-   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
-
-  /*  Initialise the default arm status */
-  status = ARM_MATH_SUCCESS;
-
-  /*  Initialise the FFT length */
-  S->fftLen = fftLen;
-
-  /*  Initialise the Twiddle coefficient pointer */
-  S->pTwiddle = (float32_t *) twiddleCoef;
-
-  /*  Initialise the Flag for selection of CFFT or CIFFT */
-  S->ifftFlag = ifftFlag;
-
-  /*  Initialise the Flag for calculation Bit reversal or not */
-  S->bitReverseFlag = bitReverseFlag;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
-
-  /*  Initializations of structure parameters depending on the FFT length */
-  switch (S->fftLen)
-  {
-
-  case 4096U:
-    /*  Initializations of structure parameters for 4096 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 1U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 1U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) armBitRevTable;
-    /*  Initialise the 1/fftLen Value */
-    S->onebyfftLen = 0.000244140625;
-    break;
-
-  case 2048U:
-    /*  Initializations of structure parameters for 2048 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 2U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 2U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
-    /*  Initialise the 1/fftLen Value */
-    S->onebyfftLen = 0.00048828125;
-    break;
-
-  case 1024U:
-    /*  Initializations of structure parameters for 1024 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 4U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 4U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
-    /*  Initialise the 1/fftLen Value */
-    S->onebyfftLen = 0.0009765625f;
-    break;
-
-  case 512U:
-    /*  Initializations of structure parameters for 512 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 8U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 8U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
-    /*  Initialise the 1/fftLen Value */
-    S->onebyfftLen = 0.001953125;
-    break;
-
-  case 256U:
-    /*  Initializations of structure parameters for 256 point FFT */
-    S->twidCoefModifier = 16U;
-    S->bitRevFactor = 16U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
-    S->onebyfftLen = 0.00390625f;
-    break;
-
-  case 128U:
-    /*  Initializations of structure parameters for 128 point FFT */
-    S->twidCoefModifier = 32U;
-    S->bitRevFactor = 32U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
-    S->onebyfftLen = 0.0078125;
-    break;
-
-  case 64U:
-    /*  Initializations of structure parameters for 64 point FFT */
-    S->twidCoefModifier = 64U;
-    S->bitRevFactor = 64U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
-    S->onebyfftLen = 0.015625f;
-    break;
-
-  case 32U:
-    /*  Initializations of structure parameters for 64 point FFT */
-    S->twidCoefModifier = 128U;
-    S->bitRevFactor = 128U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
-    S->onebyfftLen = 0.03125;
-    break;
-
-  case 16U:
-    /*  Initializations of structure parameters for 16 point FFT */
-    S->twidCoefModifier = 256U;
-    S->bitRevFactor = 256U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
-    S->onebyfftLen = 0.0625f;
-    break;
-
-
-  default:
-    /*  Reporting argument error if fftSize is not valid value */
-    status = ARM_MATH_ARGUMENT_ERROR;
-    break;
-  }
-
-#endif
-#endif
-#endif
-  return (status);
-}
-
-/**
-  @} end of ComplexFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix2_init_f32.c
+ * Description:  Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the floating-point CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
+  @param[in,out] S              points to an instance of the floating-point CFFT/CIFFT structure
+  @param[in]     fftLen         length of the FFT
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   The parameter <code>fftLen</code> specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 32, 64, 128, 256, 512, 1024, 2048 and 4096.
+  @par
+                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix2_init_f32(
+  arm_cfft_radix2_instance_f32 * S,
+  uint16_t fftLen,
+  uint8_t ifftFlag,
+  uint8_t bitReverseFlag)
+{
+  /*  Assume success; the default case of the switch below replaces this with ARM_MATH_ARGUMENT_ERROR */
+  arm_status status = ARM_MATH_SUCCESS;
+
+  /*  Initialise the FFT length (stored before validation, so S is written even when the length is rejected) */
+  S->fftLen = fftLen;
+
+  /*  Initialise the Twiddle coefficient pointer (the same table is used for every supported length) */
+  S->pTwiddle = (float32_t *) twiddleCoef;
+
+  /*  Initialise the Flag for selection of CFFT or CIFFT */
+  S->ifftFlag = ifftFlag;
+
+  /*  Initialise the Flag for calculation Bit reversal or not */
+  S->bitReverseFlag = bitReverseFlag;
+
+  /*  Initializations of structure parameters depending on the FFT length */
+  switch (S->fftLen)
+  {
+  /*  Every case: twidCoefModifier = bitRevFactor = 4096 / fftLen, pBitRevTable = &armBitRevTable[bitRevFactor - 1], onebyfftLen = 1 / fftLen */
+  case 4096U:
+    /*  Initializations of structure parameters for 4096 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 1U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 1U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) armBitRevTable;
+    /*  Initialise the 1/fftLen Value */
+    S->onebyfftLen = 0.000244140625;
+    break;
+
+  case 2048U:
+    /*  Initializations of structure parameters for 2048 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 2U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 2U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+    /*  Initialise the 1/fftLen Value */
+    S->onebyfftLen = 0.00048828125;
+    break;
+
+  case 1024U:
+    /*  Initializations of structure parameters for 1024 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 4U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 4U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+    /*  Initialise the 1/fftLen Value */
+    S->onebyfftLen = 0.0009765625f;
+    break;
+
+  case 512U:
+    /*  Initializations of structure parameters for 512 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 8U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 8U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+    /*  Initialise the 1/fftLen Value */
+    S->onebyfftLen = 0.001953125;
+    break;
+
+  case 256U:
+    /*  Initializations of structure parameters for 256 point FFT */
+    S->twidCoefModifier = 16U;
+    S->bitRevFactor = 16U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+    S->onebyfftLen = 0.00390625f;
+    break;
+
+  case 128U:
+    /*  Initializations of structure parameters for 128 point FFT */
+    S->twidCoefModifier = 32U;
+    S->bitRevFactor = 32U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+    S->onebyfftLen = 0.0078125;
+    break;
+
+  case 64U:
+    /*  Initializations of structure parameters for 64 point FFT */
+    S->twidCoefModifier = 64U;
+    S->bitRevFactor = 64U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+    S->onebyfftLen = 0.015625f;
+    break;
+
+  case 32U:
+    /*  Initializations of structure parameters for 32 point FFT */
+    S->twidCoefModifier = 128U;
+    S->bitRevFactor = 128U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+    S->onebyfftLen = 0.03125;
+    break;
+
+  case 16U:
+    /*  Initializations of structure parameters for 16 point FFT */
+    S->twidCoefModifier = 256U;
+    S->bitRevFactor = 256U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+    S->onebyfftLen = 0.0625f;
+    break;
+
+
+  default:
+    /*  Reporting argument error if fftSize is not valid value */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+  return (status);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
index 68c9930..a31b569 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
@@ -1,194 +1,182 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix2_init_q15.c
- * Description:  Radix-2 Decimation in Frequency Q15 FFT & IFFT initialization function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief                        Initialization function for the Q15 CFFT/CIFFT.
-  @deprecated                   Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed
-  @param[in,out] S              points to an instance of the Q15 CFFT/CIFFT structure.
-  @param[in]     fftLen         length of the FFT.
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
-
-  @par           Details
-                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-  @par
-                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
-*/
-
-arm_status arm_cfft_radix2_init_q15(
-  arm_cfft_radix2_instance_q15 * S,
-  uint16_t fftLen,
-  uint8_t ifftFlag,
-  uint8_t bitReverseFlag)
-{
-   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)
-
-  /*  Initialise the default arm status */
-  status = ARM_MATH_SUCCESS;
-
-  /*  Initialise the FFT length */
-  S->fftLen = fftLen;
-
-  /*  Initialise the Twiddle coefficient pointer */
-  S->pTwiddle = (q15_t *) twiddleCoef_4096_q15;
-  /*  Initialise the Flag for selection of CFFT or CIFFT */
-  S->ifftFlag = ifftFlag;
-  /*  Initialise the Flag for calculation Bit reversal or not */
-  S->bitReverseFlag = bitReverseFlag;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
-
-  /*  Initializations of structure parameters depending on the FFT length */
-  switch (S->fftLen)
-  {
-  case 4096U:
-    /*  Initializations of structure parameters for 4096 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 1U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 1U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) armBitRevTable;
-
-    break;
-
-  case 2048U:
-    /*  Initializations of structure parameters for 2048 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 2U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 2U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
-
-    break;
-
-  case 1024U:
-    /*  Initializations of structure parameters for 1024 point FFT */
-    S->twidCoefModifier = 4U;
-    S->bitRevFactor = 4U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
-
-    break;
-
-  case 512U:
-    /*  Initializations of structure parameters for 512 point FFT */
-    S->twidCoefModifier = 8U;
-    S->bitRevFactor = 8U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
-
-    break;
-
-  case 256U:
-    /*  Initializations of structure parameters for 256 point FFT */
-    S->twidCoefModifier = 16U;
-    S->bitRevFactor = 16U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
-
-    break;
-
-  case 128U:
-    /*  Initializations of structure parameters for 128 point FFT */
-    S->twidCoefModifier = 32U;
-    S->bitRevFactor = 32U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
-
-    break;
-
-  case 64U:
-    /*  Initializations of structure parameters for 64 point FFT */
-    S->twidCoefModifier = 64U;
-    S->bitRevFactor = 64U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
-
-    break;
-
-  case 32U:
-    /*  Initializations of structure parameters for 32 point FFT */
-    S->twidCoefModifier = 128U;
-    S->bitRevFactor = 128U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
-
-    break;
-
-  case 16U:
-    /*  Initializations of structure parameters for 16 point FFT */
-    S->twidCoefModifier = 256U;
-    S->bitRevFactor = 256U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
-
-    break;
-
-  default:
-    /*  Reporting argument error if fftSize is not valid value */
-    status = ARM_MATH_ARGUMENT_ERROR;
-    break;
-  }
-
-#endif
-#endif
-#endif
-  return (status);
-}
-
-/**
-  @} end of ComplexFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix2_init_q15.c
+ * Description:  Radix-2 Decimation in Frequency Q15 FFT & IFFT initialization function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief                        Initialization function for the Q15 CFFT/CIFFT.
+  @deprecated                   Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+  @param[in,out] S              points to an instance of the Q15 CFFT/CIFFT structure.
+  @param[in]     fftLen         length of the FFT.
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   The parameter <code>fftLen</code> specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 32, 64, 128, 256, 512, 1024, 2048 and 4096.
+  @par
+                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix2_init_q15(
+  arm_cfft_radix2_instance_q15 * S,
+  uint16_t fftLen,
+  uint8_t ifftFlag,
+  uint8_t bitReverseFlag)
+{
+  /*  Assume success; the default case of the switch below replaces this with ARM_MATH_ARGUMENT_ERROR */
+  arm_status status = ARM_MATH_SUCCESS;
+
+  /*  Initialise the FFT length (stored before validation, so S is written even when the length is rejected) */
+  S->fftLen = fftLen;
+
+  /*  Initialise the Twiddle coefficient pointer (the same table is used for every supported length) */
+  S->pTwiddle = (q15_t *) twiddleCoef_4096_q15;
+  /*  Initialise the Flag for selection of CFFT or CIFFT */
+  S->ifftFlag = ifftFlag;
+  /*  Initialise the Flag for calculation Bit reversal or not */
+  S->bitReverseFlag = bitReverseFlag;
+
+  /*  Per length: twidCoefModifier = bitRevFactor = 4096 / fftLen, pBitRevTable = &armBitRevTable[bitRevFactor - 1] */
+  switch (S->fftLen)
+  {
+  case 4096U:
+    /*  Initializations of structure parameters for 4096 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 1U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 1U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) armBitRevTable;
+
+    break;
+
+  case 2048U:
+    /*  Initializations of structure parameters for 2048 point FFT */
+
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 2U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 2U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+
+    break;
+
+  case 1024U:
+    /*  Initializations of structure parameters for 1024 point FFT */
+    S->twidCoefModifier = 4U;
+    S->bitRevFactor = 4U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+
+    break;
+
+  case 512U:
+    /*  Initializations of structure parameters for 512 point FFT */
+    S->twidCoefModifier = 8U;
+    S->bitRevFactor = 8U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+
+    break;
+
+  case 256U:
+    /*  Initializations of structure parameters for 256 point FFT */
+    S->twidCoefModifier = 16U;
+    S->bitRevFactor = 16U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+
+    break;
+
+  case 128U:
+    /*  Initializations of structure parameters for 128 point FFT */
+    S->twidCoefModifier = 32U;
+    S->bitRevFactor = 32U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+
+    break;
+
+  case 64U:
+    /*  Initializations of structure parameters for 64 point FFT */
+    S->twidCoefModifier = 64U;
+    S->bitRevFactor = 64U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+
+    break;
+
+  case 32U:
+    /*  Initializations of structure parameters for 32 point FFT */
+    S->twidCoefModifier = 128U;
+    S->bitRevFactor = 128U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+
+    break;
+
+  case 16U:
+    /*  Initializations of structure parameters for 16 point FFT */
+    S->twidCoefModifier = 256U;
+    S->bitRevFactor = 256U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+
+    break;
+
+  default:
+    /*  Reporting argument error if fftSize is not valid value */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+  return (status);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
index 73b8a39..2cf4fd5 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
@@ -1,191 +1,179 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix2_init_q31.c
- * Description:  Radix-2 Decimation in Frequency Fixed-point CFFT & CIFFT Initialization function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Initialization function for the Q31 CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
-  @param[in,out] S              points to an instance of the Q31 CFFT/CIFFT structure
-  @param[in]     fftLen         length of the FFT
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
-
-  @par           Details
-                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-  @par
-                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
-*/
-
-arm_status arm_cfft_radix2_init_q31(
-  arm_cfft_radix2_instance_q31 * S,
-  uint16_t fftLen,
-  uint8_t ifftFlag,
-  uint8_t bitReverseFlag)
-{
-  /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)
-
-  /*  Initialise the default arm status */
-  status = ARM_MATH_SUCCESS;
-
-  /*  Initialise the FFT length */
-  S->fftLen = fftLen;
-
-  /*  Initialise the Twiddle coefficient pointer */
-  S->pTwiddle = (q31_t *) twiddleCoef_4096_q31;
-
-  /*  Initialise the Flag for selection of CFFT or CIFFT */
-  S->ifftFlag = ifftFlag;
-
-  /*  Initialise the Flag for calculation Bit reversal or not */
-  S->bitReverseFlag = bitReverseFlag;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
-
-  /*  Initializations of Instance structure depending on the FFT length */
-  switch (S->fftLen)
-  {
-    /*  Initializations of structure parameters for 4096 point FFT */
-  case 4096U:
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 1U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 1U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) armBitRevTable;
-    break;
-
-    /*  Initializations of structure parameters for 2048 point FFT */
-  case 2048U:
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 2U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 2U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
-    break;
-
-    /*  Initializations of structure parameters for 1024 point FFT */
-  case 1024U:
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 4U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 4U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
-    break;
-
-    /*  Initializations of structure parameters for 512 point FFT */
-  case 512U:
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 8U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 8U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
-    break;
-
-  case 256U:
-    /*  Initializations of structure parameters for 256 point FFT */
-    S->twidCoefModifier = 16U;
-    S->bitRevFactor = 16U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
-    break;
-
-  case 128U:
-    /*  Initializations of structure parameters for 128 point FFT */
-    S->twidCoefModifier = 32U;
-    S->bitRevFactor = 32U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
-    break;
-
-  case 64U:
-    /*  Initializations of structure parameters for 64 point FFT */
-    S->twidCoefModifier = 64U;
-    S->bitRevFactor = 64U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
-    break;
-
-  case 32U:
-    /*  Initializations of structure parameters for 32 point FFT */
-    S->twidCoefModifier = 128U;
-    S->bitRevFactor = 128U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
-    break;
-
-  case 16U:
-    /*  Initializations of structure parameters for 16 point FFT */
-    S->twidCoefModifier = 256U;
-    S->bitRevFactor = 256U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
-    break;
-
-
-  default:
-    /*  Reporting argument error if fftSize is not valid value */
-    status = ARM_MATH_ARGUMENT_ERROR;
-    break;
-  }
-
-#endif
-#endif 
-#endif
-  return (status);
-}
-
-/**
-  @} end of ComplexFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix2_init_q31.c
+ * Description:  Radix-2 Decimation in Frequency Fixed-point CFFT & CIFFT Initialization function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the Q31 CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+  @param[in,out] S              points to an instance of the Q31 CFFT/CIFFT structure
+  @param[in]     fftLen         length of the FFT
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   The parameter <code>fftLen</code> specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 32, 64, 128, 256, 512, 1024, 2048 and 4096.
+  @par
+                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix2_init_q31(
+  arm_cfft_radix2_instance_q31 * S,
+  uint16_t fftLen,
+  uint8_t ifftFlag,
+  uint8_t bitReverseFlag)
+{
+  /*  Assume success; the default case of the switch below overrides this */
+  arm_status status = ARM_MATH_SUCCESS;
+
+  /*  Record the requested length (stored before it is validated by the switch) */
+  S->fftLen = fftLen;
+
+  /*  Every length uses the twiddleCoef_4096_q31 table */
+  S->pTwiddle = (q31_t *) twiddleCoef_4096_q31;
+
+  /*  Store the flag selecting CFFT or CIFFT */
+  S->ifftFlag = ifftFlag;
+
+  /*  Store the flag controlling bit reversal of the output */
+  S->bitReverseFlag = bitReverseFlag;
+
+  /*  Per-length setup: twidCoefModifier == bitRevFactor == 4096 / fftLen, table offset == bitRevFactor - 1 */
+  switch (S->fftLen)
+  {
+    /*  Initializations of structure parameters for 4096 point FFT */
+  case 4096U:
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 1U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 1U;
+    /*  Initialise the bit reversal table pointer (start of armBitRevTable) */
+    S->pBitRevTable = (uint16_t *) armBitRevTable;
+    break;
+
+    /*  Initializations of structure parameters for 2048 point FFT */
+  case 2048U:
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 2U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 2U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+    break;
+
+    /*  Initializations of structure parameters for 1024 point FFT */
+  case 1024U:
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 4U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 4U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+    break;
+
+    /*  Initializations of structure parameters for 512 point FFT */
+  case 512U:
+    /*  Initialise the twiddle coef modifier value */
+    S->twidCoefModifier = 8U;
+    /*  Initialise the bit reversal table modifier */
+    S->bitRevFactor = 8U;
+    /*  Initialise the bit reversal table pointer */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+    break;
+
+  case 256U:
+    /*  Initializations of structure parameters for 256 point FFT */
+    S->twidCoefModifier = 16U;
+    S->bitRevFactor = 16U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+    break;
+
+  case 128U:
+    /*  Initializations of structure parameters for 128 point FFT */
+    S->twidCoefModifier = 32U;
+    S->bitRevFactor = 32U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+    break;
+
+  case 64U:
+    /*  Initializations of structure parameters for 64 point FFT */
+    S->twidCoefModifier = 64U;
+    S->bitRevFactor = 64U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+    break;
+
+  case 32U:
+    /*  Initializations of structure parameters for 32 point FFT */
+    S->twidCoefModifier = 128U;
+    S->bitRevFactor = 128U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+    break;
+
+  case 16U:
+    /*  Initializations of structure parameters for 16 point FFT */
+    S->twidCoefModifier = 256U;
+    S->bitRevFactor = 256U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+    break;
+
+
+  default:
+    /*  Report an argument error when fftLen is not one of the supported lengths */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+  return (status);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
index ca15ea1..fbb809e 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
@@ -1,689 +1,689 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix2_q15.c
- * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-void arm_radix2_butterfly_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef,
-        uint16_t twidCoefModifier);
-
-void arm_radix2_butterfly_inverse_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef,
-        uint16_t twidCoefModifier);
-
-void arm_bitreversal_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab);
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the fixed-point CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
-  @param[in]     S    points to an instance of the fixed-point CFFT/CIFFT structure
-  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @return        none
- */
-
-void arm_cfft_radix2_q15(
-  const arm_cfft_radix2_instance_q15 * S,
-        q15_t * pSrc)
-{
-
-  if (S->ifftFlag == 1U)
-  {
-    arm_radix2_butterfly_inverse_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
-  }
-  else
-  {
-    arm_radix2_butterfly_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
-  }
-
-  arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
-}
-
-/**
-  @} end of ComplexFFT group
- */
-
-void arm_radix2_butterfly_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef,
-        uint16_t twidCoefModifier)
-{
-#if defined (ARM_MATH_DSP)
-
-  uint32_t i, j, k, l;
-  uint32_t n1, n2, ia;
-  q15_t in;
-  q31_t T, S, R;
-  q31_t coeff, out1, out2;
-
-  //N = fftLen;
-  n2 = fftLen;
-
-  n1 = n2;
-  n2 = n2 >> 1;
-  ia = 0;
-
-  // loop for groups
-  for (i = 0; i < n2; i++)
-  {
-    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
-
-    ia = ia + twidCoefModifier;
-
-    l = i + n2;
-
-    T = read_q15x2 (pSrc + (2 * i));
-    in = ((int16_t) (T & 0xFFFF)) >> 1;
-    T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-
-    S = read_q15x2 (pSrc + (2 * l));
-    in = ((int16_t) (S & 0xFFFF)) >> 1;
-    S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-
-    R = __QSUB16(T, S);
-
-    write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    out1 = __SMUAD(coeff, R) >> 16;
-    out2 = __SMUSDX(coeff, R);
-#else
-    out1 = __SMUSDX(R, coeff) >> 16U;
-    out2 = __SMUAD(coeff, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
-
-    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
-
-    ia = ia + twidCoefModifier;
-
-    /* loop for butterfly */
-    i++;
-    l++;
-
-    T = read_q15x2 (pSrc + (2 * i));
-    in = ((int16_t) (T & 0xFFFF)) >> 1;
-    T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-
-    S = read_q15x2 (pSrc + (2 * l));
-    in = ((int16_t) (S & 0xFFFF)) >> 1;
-    S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-
-    R = __QSUB16(T, S);
-
-    write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    out1 = __SMUAD(coeff, R) >> 16;
-    out2 = __SMUSDX(coeff, R);
-#else
-
-    out1 = __SMUSDX(R, coeff) >> 16U;
-    out2 = __SMUAD(coeff, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
-
-  } /* groups loop end */
-
-  twidCoefModifier = twidCoefModifier << 1U;
-
-  /* loop for stage */
-  for (k = fftLen / 2; k > 2; k = k >> 1)
-  {
-    n1 = n2;
-    n2 = n2 >> 1;
-    ia = 0;
-
-    /* loop for groups */
-    for (j = 0; j < n2; j++)
-    {
-      coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
-
-      ia = ia + twidCoefModifier;
-
-      /* loop for butterfly */
-      for (i = j; i < fftLen; i += n1)
-      {
-        l = i + n2;
-
-        T = read_q15x2 (pSrc + (2 * i));
-
-        S = read_q15x2 (pSrc + (2 * l));
-
-        R = __QSUB16(T, S);
-
-        write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        out1 = __SMUAD(coeff, R) >> 16;
-        out2 = __SMUSDX(coeff, R);
-#else
-        out1 = __SMUSDX(R, coeff) >> 16U;
-        out2 = __SMUAD(coeff, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
-
-        i += n1;
-
-        l = i + n2;
-
-        T = read_q15x2 (pSrc + (2 * i));
-
-        S = read_q15x2 (pSrc + (2 * l));
-
-        R = __QSUB16(T, S);
-
-        write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        out1 = __SMUAD(coeff, R) >> 16;
-        out2 = __SMUSDX(coeff, R);
-#else
-        out1 = __SMUSDX(R, coeff) >> 16U;
-        out2 = __SMUAD(coeff, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
-
-      } /* butterfly loop end */
-
-    } /* groups loop end */
-
-    twidCoefModifier = twidCoefModifier << 1U;
-  } /* stages loop end */
-
-  n1 = n2;
-  n2 = n2 >> 1;
-  ia = 0;
-
-  coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
-
-  ia = ia + twidCoefModifier;
-
-  /* loop for butterfly */
-  for (i = 0; i < fftLen; i += n1)
-  {
-    l = i + n2;
-
-    T = read_q15x2 (pSrc + (2 * i));
-
-    S = read_q15x2 (pSrc + (2 * l));
-
-    R = __QSUB16(T, S);
-
-    write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
-
-    write_q15x2 (pSrc + (2 * l), R);
-
-    i += n1;
-    l = i + n2;
-
-    T = read_q15x2 (pSrc + (2 * i));
-
-    S = read_q15x2 (pSrc + (2 * l));
-
-    R = __QSUB16(T, S);
-
-    write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
-
-    write_q15x2 (pSrc + (2 * l), R);
-
-  } /* groups loop end */
-
-
-#else /* #if defined (ARM_MATH_DSP) */
-
-  uint32_t i, j, k, l;
-  uint32_t n1, n2, ia;
-  q15_t xt, yt, cosVal, sinVal;
-
-
-  // N = fftLen;
-  n2 = fftLen;
-
-  n1 = n2;
-  n2 = n2 >> 1;
-  ia = 0;
-
-  /* loop for groups */
-  for (j = 0; j < n2; j++)
-  {
-    cosVal = pCoef[(ia * 2)];
-    sinVal = pCoef[(ia * 2) + 1];
-    ia = ia + twidCoefModifier;
-
-    /* loop for butterfly */
-    for (i = j; i < fftLen; i += n1)
-    {
-      l = i + n2;
-      xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
-      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
-
-      yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
-      pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) +
-                         (pSrc[2 * i + 1] >> 1U)  ) >> 1U;
-
-      pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
-                     ((int16_t) (((q31_t) yt * sinVal) >> 16)));
-
-      pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
-                          ((int16_t) (((q31_t) xt * sinVal) >> 16)));
-
-    } /* butterfly loop end */
-
-  } /* groups loop end */
-
-  twidCoefModifier = twidCoefModifier << 1U;
-
-  /* loop for stage */
-  for (k = fftLen / 2; k > 2; k = k >> 1)
-  {
-    n1 = n2;
-    n2 = n2 >> 1;
-    ia = 0;
-
-    /* loop for groups */
-    for (j = 0; j < n2; j++)
-    {
-      cosVal = pCoef[ia * 2];
-      sinVal = pCoef[(ia * 2) + 1];
-      ia = ia + twidCoefModifier;
-
-      /* loop for butterfly */
-      for (i = j; i < fftLen; i += n1)
-      {
-        l = i + n2;
-        xt = pSrc[2 * i] - pSrc[2 * l];
-        pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
-
-        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-        pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
-
-        pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
-                       ((int16_t) (((q31_t) yt * sinVal) >> 16)));
-
-        pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
-                            ((int16_t) (((q31_t) xt * sinVal) >> 16)));
-
-      } /* butterfly loop end */
-
-    } /* groups loop end */
-
-    twidCoefModifier = twidCoefModifier << 1U;
-  } /* stages loop end */
-
-  n1 = n2;
-  n2 = n2 >> 1;
-  ia = 0;
-
-  /* loop for groups */
-  for (j = 0; j < n2; j++)
-  {
-    cosVal = pCoef[ia * 2];
-    sinVal = pCoef[(ia * 2) + 1];
-
-    ia = ia + twidCoefModifier;
-
-    /* loop for butterfly */
-    for (i = j; i < fftLen; i += n1)
-    {
-      l = i + n2;
-      xt = pSrc[2 * i] - pSrc[2 * l];
-      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
-
-      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
-
-      pSrc[2 * l] = xt;
-
-      pSrc[2 * l + 1] = yt;
-
-    } /* butterfly loop end */
-
-  } /* groups loop end */
-
-  twidCoefModifier = twidCoefModifier << 1U;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-}
-
-
-void arm_radix2_butterfly_inverse_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pCoef,
-        uint16_t twidCoefModifier)
-{
-#if defined (ARM_MATH_DSP)
-
-        uint32_t i, j, k, l;
-        uint32_t n1, n2, ia;
-        q15_t in;
-        q31_t T, S, R;
-        q31_t coeff, out1, out2;
-
-  // N = fftLen;
-  n2 = fftLen;
-
-  n1 = n2;
-  n2 = n2 >> 1;
-  ia = 0;
-
-  /* loop for groups */
-  for (i = 0; i < n2; i++)
-  {
-    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
-
-    ia = ia + twidCoefModifier;
-
-    l = i + n2;
-
-    T = read_q15x2 (pSrc + (2 * i));
-    in = ((int16_t) (T & 0xFFFF)) >> 1;
-    T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-
-    S = read_q15x2 (pSrc + (2 * l));
-    in = ((int16_t) (S & 0xFFFF)) >> 1;
-    S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-
-    R = __QSUB16(T, S);
-
-    write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    out1 = __SMUSD(coeff, R) >> 16;
-    out2 = __SMUADX(coeff, R);
-#else
-    out1 = __SMUADX(R, coeff) >> 16U;
-    out2 = __SMUSD(__QSUB(0, coeff), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
-
-    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
-
-    ia = ia + twidCoefModifier;
-
-    /* loop for butterfly */
-    i++;
-    l++;
-
-    T = read_q15x2 (pSrc + (2 * i));
-    in = ((int16_t) (T & 0xFFFF)) >> 1;
-    T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-
-    S = read_q15x2 (pSrc + (2 * l));
-    in = ((int16_t) (S & 0xFFFF)) >> 1;
-    S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
-
-    R = __QSUB16(T, S);
-
-    write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    out1 = __SMUSD(coeff, R) >> 16;
-    out2 = __SMUADX(coeff, R);
-#else
-    out1 = __SMUADX(R, coeff) >> 16U;
-    out2 = __SMUSD(__QSUB(0, coeff), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
-
-  } /* groups loop end */
-
-  twidCoefModifier = twidCoefModifier << 1U;
-
-  /* loop for stage */
-  for (k = fftLen / 2; k > 2; k = k >> 1)
-  {
-    n1 = n2;
-    n2 = n2 >> 1;
-    ia = 0;
-
-    /* loop for groups */
-    for (j = 0; j < n2; j++)
-    {
-      coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
-
-      ia = ia + twidCoefModifier;
-
-      /* loop for butterfly */
-      for (i = j; i < fftLen; i += n1)
-      {
-        l = i + n2;
-
-        T = read_q15x2 (pSrc + (2 * i));
-
-        S = read_q15x2 (pSrc + (2 * l));
-
-        R = __QSUB16(T, S);
-
-        write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        out1 = __SMUSD(coeff, R) >> 16;
-        out2 = __SMUADX(coeff, R);
-#else
-        out1 = __SMUADX(R, coeff) >> 16U;
-        out2 = __SMUSD(__QSUB(0, coeff), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
-
-        i += n1;
-
-        l = i + n2;
-
-        T = read_q15x2 (pSrc + (2 * i));
-
-        S = read_q15x2 (pSrc + (2 * l));
-
-        R = __QSUB16(T, S);
-
-        write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        out1 = __SMUSD(coeff, R) >> 16;
-        out2 = __SMUADX(coeff, R);
-#else
-        out1 = __SMUADX(R, coeff) >> 16U;
-        out2 = __SMUSD(__QSUB(0, coeff), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
-
-      } /* butterfly loop end */
-
-    } /* groups loop end */
-
-    twidCoefModifier = twidCoefModifier << 1U;
-  } /* stages loop end */
-
-  n1 = n2;
-  n2 = n2 >> 1;
-  ia = 0;
-
-  /* loop for groups */
-  for (j = 0; j < n2; j++)
-  {
-    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
-
-    ia = ia + twidCoefModifier;
-
-    /* loop for butterfly */
-    for (i = j; i < fftLen; i += n1)
-    {
-      l = i + n2;
-
-      T = read_q15x2 (pSrc + (2 * i));
-
-      S = read_q15x2 (pSrc + (2 * l));
-
-      R = __QSUB16(T, S);
-
-      write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
-
-      write_q15x2 (pSrc + (2 * l), R);
-
-    } /* butterfly loop end */
-
-  } /* groups loop end */
-
-  twidCoefModifier = twidCoefModifier << 1U;
-
-#else /* #if defined (ARM_MATH_DSP) */
-
-        uint32_t i, j, k, l;
-        uint32_t n1, n2, ia;
-        q15_t xt, yt, cosVal, sinVal;
-
-  // N = fftLen;
-  n2 = fftLen;
-
-  n1 = n2;
-  n2 = n2 >> 1;
-  ia = 0;
-
-  /* loop for groups */
-  for (j = 0; j < n2; j++)
-  {
-    cosVal = pCoef[(ia * 2)];
-    sinVal = pCoef[(ia * 2) + 1];
-    ia = ia + twidCoefModifier;
-
-    /* loop for butterfly */
-    for (i = j; i < fftLen; i += n1)
-    {
-      l = i + n2;
-      xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
-      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
-
-      yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
-      pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) +
-                         (pSrc[2 * i + 1] >> 1U)  ) >> 1U;
-
-      pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
-                     ((int16_t) (((q31_t) yt * sinVal) >> 16)));
-
-      pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
-                         ((int16_t) (((q31_t) xt * sinVal) >> 16)));
-
-    } /* butterfly loop end */
-
-  } /* groups loop end */
-
-  twidCoefModifier = twidCoefModifier << 1U;
-
-  /* loop for stage */
-  for (k = fftLen / 2; k > 2; k = k >> 1)
-  {
-    n1 = n2;
-    n2 = n2 >> 1;
-    ia = 0;
-
-    /* loop for groups */
-    for (j = 0; j < n2; j++)
-    {
-      cosVal = pCoef[(ia * 2)];
-      sinVal = pCoef[(ia * 2) + 1];
-      ia = ia + twidCoefModifier;
-
-      /* loop for butterfly */
-      for (i = j; i < fftLen; i += n1)
-      {
-        l = i + n2;
-        xt = pSrc[2 * i] - pSrc[2 * l];
-        pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
-
-        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-        pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
-
-        pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
-                       ((int16_t) (((q31_t) yt * sinVal) >> 16))  );
-
-        pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
-                           ((int16_t) (((q31_t) xt * sinVal) >> 16))  );
-
-      } /* butterfly loop end */
-
-    } /* groups loop end */
-
-    twidCoefModifier = twidCoefModifier << 1U;
-  } /* stages loop end */
-
-  n1 = n2;
-  n2 = n2 >> 1;
-  ia = 0;
-
-  cosVal = pCoef[(ia * 2)];
-  sinVal = pCoef[(ia * 2) + 1];
-
-  ia = ia + twidCoefModifier;
-
-  /* loop for butterfly */
-  for (i = 0; i < fftLen; i += n1)
-  {
-    l = i + n2;
-    xt = pSrc[2 * i] - pSrc[2 * l];
-    pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
-
-    yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-    pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
-
-    pSrc[2 * l] = xt;
-
-    pSrc[2 * l + 1] = yt;
-
-  } /* groups loop end */
-
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix2_q15.c
+ * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+void arm_radix2_butterfly_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef,
+        uint16_t twidCoefModifier);
+
+void arm_radix2_butterfly_inverse_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef,
+        uint16_t twidCoefModifier);
+
+void arm_bitreversal_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the fixed-point CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+  @param[in]     S    points to an instance of the fixed-point CFFT/CIFFT structure
+  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @return        none
+ */
+
+void arm_cfft_radix2_q15(
+  const arm_cfft_radix2_instance_q15 * S,
+        q15_t * pSrc)
+{
+  /* Run the inverse or the forward butterfly pass, chosen by the instance's ifftFlag. */
+  if (S->ifftFlag == 1U)
+  {
+    arm_radix2_butterfly_inverse_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+  else
+  {
+    arm_radix2_butterfly_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+  /* Both paths finish with the same bit-reversal call on the in-place buffer. */
+  arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
+
+void arm_radix2_butterfly_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef,
+        uint16_t twidCoefModifier)
+{
+#if defined (ARM_MATH_DSP)
+  /* Forward radix-2 butterfly stages over pSrc, in place; this branch handles each sample as a packed q15 pair. */
+  uint32_t i, j, k, l;
+  uint32_t n1, n2, ia;
+  q15_t in;
+  q31_t T, S, R;
+  q31_t coeff, out1, out2;
+
+  //N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+  /* First stage: l = i + n2 pairs sample i with the sample half a buffer away. */
+  // loop for groups (two butterflies per pass: note the extra i++ in the body)
+  for (i = 0; i < n2; i++)
+  {
+    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+    ia = ia + twidCoefModifier;
+
+    l = i + n2;
+    /* Load sample i and shift each 16-bit half right by one on its own. */
+    T = read_q15x2 (pSrc + (2 * i));
+    in = ((int16_t) (T & 0xFFFF)) >> 1;
+    T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+    /* Same pre-scaling for sample l. */
+    S = read_q15x2 (pSrc + (2 * l));
+    in = ((int16_t) (S & 0xFFFF)) >> 1;
+    S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+    /* R is the per-half difference (__QSUB16); the per-half halved sum (__SHADD16) goes back to slot i. */
+    R = __QSUB16(T, S);
+
+    write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+    /* Dual 16-bit multiplies of twiddle and difference; the intrinsics used depend on endianness. */
+#ifndef ARM_MATH_BIG_ENDIAN
+    out1 = __SMUAD(coeff, R) >> 16;
+    out2 = __SMUSDX(coeff, R);
+#else
+    out1 = __SMUSDX(R, coeff) >> 16U;
+    out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+    /* Pack slot l: upper half taken from out2, lower half from out1 (already shifted down by 16). */
+    write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+    ia = ia + twidCoefModifier;
+
+    /* second butterfly of this pass: step both indices by one, using the twiddle just loaded */
+    i++;
+    l++;
+
+    T = read_q15x2 (pSrc + (2 * i));
+    in = ((int16_t) (T & 0xFFFF)) >> 1;
+    T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+    S = read_q15x2 (pSrc + (2 * l));
+    in = ((int16_t) (S & 0xFFFF)) >> 1;
+    S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+    R = __QSUB16(T, S);
+
+    write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    out1 = __SMUAD(coeff, R) >> 16;
+    out2 = __SMUSDX(coeff, R);
+#else
+
+    out1 = __SMUSDX(R, coeff) >> 16U;
+    out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+  } /* groups loop end */
+  /* The step through the twiddle table doubles before each following stage. */
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  /* loop for stage: middle stages, inputs are no longer pre-shifted */
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+    n1 = n2;
+    n2 = n2 >> 1;
+    ia = 0;
+
+    /* loop for groups */
+    for (j = 0; j < n2; j++)
+    {
+      coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+      ia = ia + twidCoefModifier;
+
+      /* loop for butterfly */
+      for (i = j; i < fftLen; i += n1)
+      {
+        l = i + n2;
+
+        T = read_q15x2 (pSrc + (2 * i));
+
+        S = read_q15x2 (pSrc + (2 * l));
+
+        R = __QSUB16(T, S);
+
+        write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        out1 = __SMUAD(coeff, R) >> 16;
+        out2 = __SMUSDX(coeff, R);
+#else
+        out1 = __SMUSDX(R, coeff) >> 16U;
+        out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        /* second butterfly for the same twiddle, one n1 stride further on */
+        i += n1;
+
+        l = i + n2;
+
+        T = read_q15x2 (pSrc + (2 * i));
+
+        S = read_q15x2 (pSrc + (2 * l));
+
+        R = __QSUB16(T, S);
+
+        write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        out1 = __SMUAD(coeff, R) >> 16;
+        out2 = __SMUSDX(coeff, R);
+#else
+        out1 = __SMUSDX(R, coeff) >> 16U;
+        out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+      } /* butterfly loop end */
+
+    } /* groups loop end */
+
+    twidCoefModifier = twidCoefModifier << 1U;
+  } /* stages loop end */
+  /* Last stage: no twiddle multiply; sum (__QADD16) and difference are written back directly. */
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+  /* NOTE(review): coeff and ia are still set below, but the final loop never reads them. */
+  coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+  ia = ia + twidCoefModifier;
+
+  /* loop for butterfly (two butterflies per pass) */
+  for (i = 0; i < fftLen; i += n1)
+  {
+    l = i + n2;
+
+    T = read_q15x2 (pSrc + (2 * i));
+
+    S = read_q15x2 (pSrc + (2 * l));
+
+    R = __QSUB16(T, S);
+
+    write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
+
+    write_q15x2 (pSrc + (2 * l), R);
+
+    i += n1;
+    l = i + n2;
+
+    T = read_q15x2 (pSrc + (2 * i));
+
+    S = read_q15x2 (pSrc + (2 * l));
+
+    R = __QSUB16(T, S);
+
+    write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
+
+    write_q15x2 (pSrc + (2 * l), R);
+
+  } /* butterfly loop end */
+
+
+#else /* #if defined (ARM_MATH_DSP) */
+  /* Scalar branch: real and imaginary parts are separate q15 elements of pSrc (2*i and 2*i + 1). */
+  uint32_t i, j, k, l;
+  uint32_t n1, n2, ia;
+  q15_t xt, yt, cosVal, sinVal;
+
+
+  // N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+  /* First stage: every operand is shifted right by one before it is combined. */
+  /* loop for groups */
+  for (j = 0; j < n2; j++)
+  {
+    cosVal = pCoef[(ia * 2)];
+    sinVal = pCoef[(ia * 2) + 1];
+    ia = ia + twidCoefModifier;
+
+    /* loop for butterfly */
+    for (i = j; i < fftLen; i += n1)
+    {
+      l = i + n2;
+      xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+      yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+      pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) +
+                         (pSrc[2 * i + 1] >> 1U)  ) >> 1U;
+      /* Slot l receives the difference times the twiddle: real xt*cos + yt*sin, imag yt*cos - xt*sin. */
+      pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
+                     ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+      pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
+                          ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+    } /* butterfly loop end */
+
+  } /* groups loop end */
+  /* The step through the twiddle table doubles before each following stage. */
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  /* loop for stage */
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+    n1 = n2;
+    n2 = n2 >> 1;
+    ia = 0;
+
+    /* loop for groups */
+    for (j = 0; j < n2; j++)
+    {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+      ia = ia + twidCoefModifier;
+
+      /* loop for butterfly */
+      for (i = j; i < fftLen; i += n1)
+      {
+        l = i + n2;
+        xt = pSrc[2 * i] - pSrc[2 * l];
+        pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
+
+        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+        pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
+
+        pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
+                       ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+        pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
+                            ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+      } /* butterfly loop end */
+
+    } /* groups loop end */
+
+    twidCoefModifier = twidCoefModifier << 1U;
+  } /* stages loop end */
+  /* Last stage: plain add and subtract, with no shift and no twiddle multiply. */
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+  /* NOTE(review): cosVal and sinVal are still loaded below, but this stage's butterfly does not read them. */
+  /* loop for groups */
+  for (j = 0; j < n2; j++)
+  {
+    cosVal = pCoef[ia * 2];
+    sinVal = pCoef[(ia * 2) + 1];
+
+    ia = ia + twidCoefModifier;
+
+    /* loop for butterfly */
+    for (i = j; i < fftLen; i += n1)
+    {
+      l = i + n2;
+      xt = pSrc[2 * i] - pSrc[2 * l];
+      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+      pSrc[2 * l] = xt;
+
+      pSrc[2 * l + 1] = yt;
+
+    } /* butterfly loop end */
+
+  } /* groups loop end */
+  /* NOTE(review): twidCoefModifier is not read again after this update. */
+  twidCoefModifier = twidCoefModifier << 1U;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+
+void arm_radix2_butterfly_inverse_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef,
+        uint16_t twidCoefModifier)
+{
+#if defined (ARM_MATH_DSP)
+  /* Inverse radix-2 butterfly stages over pSrc, in place; this branch handles each sample as a packed q15 pair. */
+        uint32_t i, j, k, l;
+        uint32_t n1, n2, ia;
+        q15_t in;
+        q31_t T, S, R;
+        q31_t coeff, out1, out2;
+
+  // N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+  /* First stage: l = i + n2 pairs sample i with the sample half a buffer away; two butterflies per pass. */
+  /* loop for groups */
+  for (i = 0; i < n2; i++)
+  {
+    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+    ia = ia + twidCoefModifier;
+
+    l = i + n2;
+    /* Load sample i and shift each 16-bit half right by one on its own. */
+    T = read_q15x2 (pSrc + (2 * i));
+    in = ((int16_t) (T & 0xFFFF)) >> 1;
+    T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+    /* Same pre-scaling for sample l. */
+    S = read_q15x2 (pSrc + (2 * l));
+    in = ((int16_t) (S & 0xFFFF)) >> 1;
+    S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+    /* R is the per-half difference (__QSUB16); the per-half halved sum (__SHADD16) goes back to slot i. */
+    R = __QSUB16(T, S);
+
+    write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+    /* Dual 16-bit multiplies of twiddle and difference; the forward routine uses __SMUAD/__SMUSDX here instead. */
+#ifndef ARM_MATH_BIG_ENDIAN
+    out1 = __SMUSD(coeff, R) >> 16;
+    out2 = __SMUADX(coeff, R);
+#else
+    out1 = __SMUADX(R, coeff) >> 16U;
+    out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+    /* Pack slot l: upper half taken from out2, lower half from out1 (already shifted down by 16). */
+    write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+    ia = ia + twidCoefModifier;
+
+    /* second butterfly of this pass: step both indices by one, using the twiddle just loaded */
+    i++;
+    l++;
+
+    T = read_q15x2 (pSrc + (2 * i));
+    in = ((int16_t) (T & 0xFFFF)) >> 1;
+    T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+    S = read_q15x2 (pSrc + (2 * l));
+    in = ((int16_t) (S & 0xFFFF)) >> 1;
+    S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+    R = __QSUB16(T, S);
+
+    write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    out1 = __SMUSD(coeff, R) >> 16;
+    out2 = __SMUADX(coeff, R);
+#else
+    out1 = __SMUADX(R, coeff) >> 16U;
+    out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+  } /* groups loop end */
+  /* The step through the twiddle table doubles before each following stage. */
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  /* loop for stage: middle stages, inputs are no longer pre-shifted */
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+    n1 = n2;
+    n2 = n2 >> 1;
+    ia = 0;
+
+    /* loop for groups */
+    for (j = 0; j < n2; j++)
+    {
+      coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+      ia = ia + twidCoefModifier;
+
+      /* loop for butterfly */
+      for (i = j; i < fftLen; i += n1)
+      {
+        l = i + n2;
+
+        T = read_q15x2 (pSrc + (2 * i));
+
+        S = read_q15x2 (pSrc + (2 * l));
+
+        R = __QSUB16(T, S);
+
+        write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        out1 = __SMUSD(coeff, R) >> 16;
+        out2 = __SMUADX(coeff, R);
+#else
+        out1 = __SMUADX(R, coeff) >> 16U;
+        out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        /* second butterfly for the same twiddle, one n1 stride further on */
+        i += n1;
+
+        l = i + n2;
+
+        T = read_q15x2 (pSrc + (2 * i));
+
+        S = read_q15x2 (pSrc + (2 * l));
+
+        R = __QSUB16(T, S);
+
+        write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        out1 = __SMUSD(coeff, R) >> 16;
+        out2 = __SMUADX(coeff, R);
+#else
+        out1 = __SMUADX(R, coeff) >> 16U;
+        out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+      } /* butterfly loop end */
+
+    } /* groups loop end */
+
+    twidCoefModifier = twidCoefModifier << 1U;
+  } /* stages loop end */
+  /* Last stage: no twiddle multiply; sum (__QADD16) and difference are written back directly. */
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+  /* NOTE(review): coeff is still loaded in the loop below, but this stage's butterfly does not read it. */
+  /* loop for groups */
+  for (j = 0; j < n2; j++)
+  {
+    coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+    ia = ia + twidCoefModifier;
+
+    /* loop for butterfly */
+    for (i = j; i < fftLen; i += n1)
+    {
+      l = i + n2;
+
+      T = read_q15x2 (pSrc + (2 * i));
+
+      S = read_q15x2 (pSrc + (2 * l));
+
+      R = __QSUB16(T, S);
+
+      write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
+
+      write_q15x2 (pSrc + (2 * l), R);
+
+    } /* butterfly loop end */
+
+  } /* groups loop end */
+  /* NOTE(review): twidCoefModifier is not read again in this branch after this update. */
+  twidCoefModifier = twidCoefModifier << 1U;
+
+#else /* #if defined (ARM_MATH_DSP) */
+  /* Scalar branch: real and imaginary parts are separate q15 elements of pSrc (2*i and 2*i + 1). */
+        uint32_t i, j, k, l;
+        uint32_t n1, n2, ia;
+        q15_t xt, yt, cosVal, sinVal;
+
+  // N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+  /* First stage: every operand is shifted right by one before it is combined. */
+  /* loop for groups */
+  for (j = 0; j < n2; j++)
+  {
+    cosVal = pCoef[(ia * 2)];
+    sinVal = pCoef[(ia * 2) + 1];
+    ia = ia + twidCoefModifier;
+
+    /* loop for butterfly */
+    for (i = j; i < fftLen; i += n1)
+    {
+      l = i + n2;
+      xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+      yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+      pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) +
+                         (pSrc[2 * i + 1] >> 1U)  ) >> 1U;
+      /* Slot l receives the difference times the twiddle: real xt*cos - yt*sin, imag yt*cos + xt*sin. */
+      pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
+                     ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+      pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
+                         ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+    } /* butterfly loop end */
+
+  } /* groups loop end */
+  /* The step through the twiddle table doubles before each following stage. */
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  /* loop for stage */
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+    n1 = n2;
+    n2 = n2 >> 1;
+    ia = 0;
+
+    /* loop for groups */
+    for (j = 0; j < n2; j++)
+    {
+      cosVal = pCoef[(ia * 2)];
+      sinVal = pCoef[(ia * 2) + 1];
+      ia = ia + twidCoefModifier;
+
+      /* loop for butterfly */
+      for (i = j; i < fftLen; i += n1)
+      {
+        l = i + n2;
+        xt = pSrc[2 * i] - pSrc[2 * l];
+        pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
+
+        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+        pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
+
+        pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
+                       ((int16_t) (((q31_t) yt * sinVal) >> 16))  );
+
+        pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
+                           ((int16_t) (((q31_t) xt * sinVal) >> 16))  );
+
+      } /* butterfly loop end */
+
+    } /* groups loop end */
+
+    twidCoefModifier = twidCoefModifier << 1U;
+  } /* stages loop end */
+  /* Last stage: plain add and subtract, with no shift and no twiddle multiply. */
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+  /* NOTE(review): cosVal, sinVal and ia are still set below, but the final loop never reads them. */
+  cosVal = pCoef[(ia * 2)];
+  sinVal = pCoef[(ia * 2) + 1];
+
+  ia = ia + twidCoefModifier;
+
+  /* loop for butterfly */
+  for (i = 0; i < fftLen; i += n1)
+  {
+    l = i + n2;
+    xt = pSrc[2 * i] - pSrc[2 * l];
+    pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+    yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+    pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+    pSrc[2 * l] = xt;
+
+    pSrc[2 * l + 1] = yt;
+
+  } /* butterfly loop end */
+
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
index 996e91d..27f1408 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
@@ -1,337 +1,337 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix2_q31.c
- * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-void arm_radix2_butterfly_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint16_t twidCoefModifier);
-
-void arm_radix2_butterfly_inverse_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint16_t twidCoefModifier);
-
-void arm_bitreversal_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab);
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the fixed-point CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
-  @param[in]     S    points to an instance of the fixed-point CFFT/CIFFT structure
-  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @return        none
- */
-
-void arm_cfft_radix2_q31(
-  const arm_cfft_radix2_instance_q31 * S,
-        q31_t * pSrc)
-{
-
-   if (S->ifftFlag == 1U)
-   {
-      arm_radix2_butterfly_inverse_q31(pSrc, S->fftLen,
-      S->pTwiddle, S->twidCoefModifier);
-   }
-   else
-   {
-      arm_radix2_butterfly_q31(pSrc, S->fftLen,
-      S->pTwiddle, S->twidCoefModifier);
-   }
-
-   arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
-}
-
-/**
-  @} end of ComplexFFT group
- */
-
-void arm_radix2_butterfly_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint16_t twidCoefModifier)
-{
-
-   unsigned i, j, k, l, m;
-   unsigned n1, n2, ia;
-   q31_t xt, yt, cosVal, sinVal;
-   q31_t p0, p1;
-
-   //N = fftLen;
-   n2 = fftLen;
-
-   n1 = n2;
-   n2 = n2 >> 1;
-   ia = 0;
-
-   // loop for groups
-   for (i = 0; i < n2; i++)
-   {
-      cosVal = pCoef[ia * 2];
-      sinVal = pCoef[(ia * 2) + 1];
-      ia = ia + twidCoefModifier;
-
-      l = i + n2;
-      xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
-      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
-
-      yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
-      pSrc[2 * i + 1] =
-        ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
-
-      mult_32x32_keep32_R(p0, xt, cosVal);
-      mult_32x32_keep32_R(p1, yt, cosVal);
-      multAcc_32x32_keep32_R(p0, yt, sinVal);
-      multSub_32x32_keep32_R(p1, xt, sinVal);
-
-      pSrc[2U * l] = p0;
-      pSrc[2U * l + 1U] = p1;
-
-   }                             // groups loop end
-
-   twidCoefModifier <<= 1U;
-
-   // loop for stage
-   for (k = fftLen / 2; k > 2; k = k >> 1)
-   {
-      n1 = n2;
-      n2 = n2 >> 1;
-      ia = 0;
-
-      // loop for groups
-      for (j = 0; j < n2; j++)
-      {
-         cosVal = pCoef[ia * 2];
-         sinVal = pCoef[(ia * 2) + 1];
-         ia = ia + twidCoefModifier;
-
-         // loop for butterfly
-         i = j;
-         m = fftLen / n1;
-         do
-         {
-            l = i + n2;
-            xt = pSrc[2 * i] - pSrc[2 * l];
-            pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
-
-            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-            pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
-
-            mult_32x32_keep32_R(p0, xt, cosVal);
-            mult_32x32_keep32_R(p1, yt, cosVal);
-            multAcc_32x32_keep32_R(p0, yt, sinVal);
-            multSub_32x32_keep32_R(p1, xt, sinVal);
-
-            pSrc[2U * l] = p0;
-            pSrc[2U * l + 1U] = p1;
-            i += n1;
-            m--;
-         } while ( m > 0);                   // butterfly loop end
-
-      }                           // groups loop end
-
-      twidCoefModifier <<= 1U;
-   }                             // stages loop end
-
-   n1 = n2;
-   n2 = n2 >> 1;
-   ia = 0;
-
-   cosVal = pCoef[ia * 2];
-   sinVal = pCoef[(ia * 2) + 1];
-   ia = ia + twidCoefModifier;
-
-   // loop for butterfly
-   for (i = 0; i < fftLen; i += n1)
-   {
-      l = i + n2;
-      xt = pSrc[2 * i] - pSrc[2 * l];
-      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
-
-      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
-
-      pSrc[2U * l] = xt;
-
-      pSrc[2U * l + 1U] = yt;
-
-      i += n1;
-      l = i + n2;
-
-      xt = pSrc[2 * i] - pSrc[2 * l];
-      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
-
-      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
-
-      pSrc[2U * l] = xt;
-
-      pSrc[2U * l + 1U] = yt;
-
-   }                             // butterfly loop end
-
-}
-
-
-void arm_radix2_butterfly_inverse_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint16_t twidCoefModifier)
-{
-
-   unsigned i, j, k, l;
-   unsigned n1, n2, ia;
-   q31_t xt, yt, cosVal, sinVal;
-   q31_t p0, p1;
-
-   //N = fftLen;
-   n2 = fftLen;
-
-   n1 = n2;
-   n2 = n2 >> 1;
-   ia = 0;
-
-   // loop for groups
-   for (i = 0; i < n2; i++)
-   {
-      cosVal = pCoef[ia * 2];
-      sinVal = pCoef[(ia * 2) + 1];
-      ia = ia + twidCoefModifier;
-
-      l = i + n2;
-      xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
-      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
-
-      yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
-      pSrc[2 * i + 1] =
-        ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
-
-      mult_32x32_keep32_R(p0, xt, cosVal);
-      mult_32x32_keep32_R(p1, yt, cosVal);
-      multSub_32x32_keep32_R(p0, yt, sinVal);
-      multAcc_32x32_keep32_R(p1, xt, sinVal);
-
-      pSrc[2U * l] = p0;
-      pSrc[2U * l + 1U] = p1;
-   }                             // groups loop end
-
-   twidCoefModifier = twidCoefModifier << 1U;
-
-   // loop for stage
-   for (k = fftLen / 2; k > 2; k = k >> 1)
-   {
-      n1 = n2;
-      n2 = n2 >> 1;
-      ia = 0;
-
-      // loop for groups
-      for (j = 0; j < n2; j++)
-      {
-         cosVal = pCoef[ia * 2];
-         sinVal = pCoef[(ia * 2) + 1];
-         ia = ia + twidCoefModifier;
-
-         // loop for butterfly
-         for (i = j; i < fftLen; i += n1)
-         {
-            l = i + n2;
-            xt = pSrc[2 * i] - pSrc[2 * l];
-            pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
-
-            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-            pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
-
-            mult_32x32_keep32_R(p0, xt, cosVal);
-            mult_32x32_keep32_R(p1, yt, cosVal);
-            multSub_32x32_keep32_R(p0, yt, sinVal);
-            multAcc_32x32_keep32_R(p1, xt, sinVal);
-
-            pSrc[2U * l] = p0;
-            pSrc[2U * l + 1U] = p1;
-         }                         // butterfly loop end
-
-      }                           // groups loop end
-
-      twidCoefModifier = twidCoefModifier << 1U;
-   }                             // stages loop end
-
-   n1 = n2;
-   n2 = n2 >> 1;
-   ia = 0;
-
-   cosVal = pCoef[ia * 2];
-   sinVal = pCoef[(ia * 2) + 1];
-   ia = ia + twidCoefModifier;
-
-   // loop for butterfly
-   for (i = 0; i < fftLen; i += n1)
-   {
-      l = i + n2;
-      xt = pSrc[2 * i] - pSrc[2 * l];
-      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
-
-      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
-
-      pSrc[2U * l] = xt;
-
-      pSrc[2U * l + 1U] = yt;
-
-      i += n1;
-      l = i + n2;
-
-      xt = pSrc[2 * i] - pSrc[2 * l];
-      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
-
-      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
-      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
-
-      pSrc[2U * l] = xt;
-
-      pSrc[2U * l + 1U] = yt;
-
-   }                             // butterfly loop end
-
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix2_q31.c
+ * Description:  Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+void arm_radix2_butterfly_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint16_t twidCoefModifier);
+
+void arm_radix2_butterfly_inverse_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint16_t twidCoefModifier);
+
+void arm_bitreversal_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the fixed-point CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+  @param[in]     S    points to an instance of the fixed-point CFFT/CIFFT structure
+  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @return        none
+ */
+
+void arm_cfft_radix2_q31(
+  const arm_cfft_radix2_instance_q31 * S,
+        q31_t * pSrc)
+{
+   // Run the inverse or the forward butterfly pass, chosen by the instance's ifftFlag.
+   if (S->ifftFlag == 1U)
+   {
+      arm_radix2_butterfly_inverse_q31(pSrc, S->fftLen,
+      S->pTwiddle, S->twidCoefModifier);
+   }
+   else
+   {
+      arm_radix2_butterfly_q31(pSrc, S->fftLen,
+      S->pTwiddle, S->twidCoefModifier);
+   }
+   // Both paths finish with the same bit-reversal call on the in-place buffer.
+   arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
+
+void arm_radix2_butterfly_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint16_t twidCoefModifier)
+{
+   // Forward radix-2 butterfly stages over pSrc, in place; real parts at index 2*i, imaginary parts at 2*i + 1.
+   unsigned i, j, k, l, m;
+   unsigned n1, n2, ia;
+   q31_t xt, yt, cosVal, sinVal;
+   q31_t p0, p1;
+
+   //N = fftLen;
+   n2 = fftLen;
+
+   n1 = n2;
+   n2 = n2 >> 1;
+   ia = 0;
+   // First stage: every operand is shifted right by one before it is combined; l = i + n2 is the partner sample.
+   // loop for groups
+   for (i = 0; i < n2; i++)
+   {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+      ia = ia + twidCoefModifier;
+
+      l = i + n2;
+      xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+      yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+      pSrc[2 * i + 1] =
+        ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+      // NOTE(review): going by the macro names, p0 becomes xt*cos + yt*sin and p1 becomes yt*cos - xt*sin; confirm against the macro definitions.
+      mult_32x32_keep32_R(p0, xt, cosVal);
+      mult_32x32_keep32_R(p1, yt, cosVal);
+      multAcc_32x32_keep32_R(p0, yt, sinVal);
+      multSub_32x32_keep32_R(p1, xt, sinVal);
+
+      pSrc[2U * l] = p0;
+      pSrc[2U * l + 1U] = p1;
+
+   }                             // groups loop end
+   // The step through the twiddle table doubles before each following stage.
+   twidCoefModifier <<= 1U;
+
+   // loop for stage
+   for (k = fftLen / 2; k > 2; k = k >> 1)
+   {
+      n1 = n2;
+      n2 = n2 >> 1;
+      ia = 0;
+
+      // loop for groups
+      for (j = 0; j < n2; j++)
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia = ia + twidCoefModifier;
+
+         // loop for butterfly: m counts down the fftLen / n1 butterflies that share this twiddle
+         i = j;
+         m = fftLen / n1;
+         do
+         {
+            l = i + n2;
+            xt = pSrc[2 * i] - pSrc[2 * l];
+            pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
+
+            mult_32x32_keep32_R(p0, xt, cosVal);
+            mult_32x32_keep32_R(p1, yt, cosVal);
+            multAcc_32x32_keep32_R(p0, yt, sinVal);
+            multSub_32x32_keep32_R(p1, xt, sinVal);
+
+            pSrc[2U * l] = p0;
+            pSrc[2U * l + 1U] = p1;
+            i += n1;
+            m--;
+         } while ( m > 0);                   // butterfly loop end
+
+      }                           // groups loop end
+
+      twidCoefModifier <<= 1U;
+   }                             // stages loop end
+   // Last stage: plain add and subtract, with no shift and no twiddle multiply.
+   n1 = n2;
+   n2 = n2 >> 1;
+   ia = 0;
+   // NOTE(review): cosVal, sinVal and ia are still set below, but the final loop never reads them.
+   cosVal = pCoef[ia * 2];
+   sinVal = pCoef[(ia * 2) + 1];
+   ia = ia + twidCoefModifier;
+
+   // loop for butterfly (two butterflies per pass)
+   for (i = 0; i < fftLen; i += n1)
+   {
+      l = i + n2;
+      xt = pSrc[2 * i] - pSrc[2 * l];
+      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+      pSrc[2U * l] = xt;
+
+      pSrc[2U * l + 1U] = yt;
+      // second butterfly of this pass, one n1 stride further on
+      i += n1;
+      l = i + n2;
+
+      xt = pSrc[2 * i] - pSrc[2 * l];
+      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+      pSrc[2U * l] = xt;
+
+      pSrc[2U * l + 1U] = yt;
+
+   }                             // butterfly loop end
+
+}
+
+
+void arm_radix2_butterfly_inverse_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint16_t twidCoefModifier)
+{
+   // Inverse radix-2 butterfly stages over pSrc, in place; real parts at index 2*i, imaginary parts at 2*i + 1.
+   unsigned i, j, k, l;
+   unsigned n1, n2, ia;
+   q31_t xt, yt, cosVal, sinVal;
+   q31_t p0, p1;
+
+   //N = fftLen;
+   n2 = fftLen;
+
+   n1 = n2;
+   n2 = n2 >> 1;
+   ia = 0;
+   // First stage: every operand is shifted right by one before it is combined; l = i + n2 is the partner sample.
+   // loop for groups
+   for (i = 0; i < n2; i++)
+   {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+      ia = ia + twidCoefModifier;
+
+      l = i + n2;
+      xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+      yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+      pSrc[2 * i + 1] =
+        ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+      // NOTE(review): going by the macro names, p0 becomes xt*cos - yt*sin and p1 becomes yt*cos + xt*sin; confirm against the macro definitions.
+      mult_32x32_keep32_R(p0, xt, cosVal);
+      mult_32x32_keep32_R(p1, yt, cosVal);
+      multSub_32x32_keep32_R(p0, yt, sinVal);
+      multAcc_32x32_keep32_R(p1, xt, sinVal);
+
+      pSrc[2U * l] = p0;
+      pSrc[2U * l + 1U] = p1;
+   }                             // groups loop end
+   // The step through the twiddle table doubles before each following stage.
+   twidCoefModifier = twidCoefModifier << 1U;
+
+   // loop for stage
+   for (k = fftLen / 2; k > 2; k = k >> 1)
+   {
+      n1 = n2;
+      n2 = n2 >> 1;
+      ia = 0;
+
+      // loop for groups
+      for (j = 0; j < n2; j++)
+      {
+         cosVal = pCoef[ia * 2];
+         sinVal = pCoef[(ia * 2) + 1];
+         ia = ia + twidCoefModifier;
+
+         // loop for butterfly
+         for (i = j; i < fftLen; i += n1)
+         {
+            l = i + n2;
+            xt = pSrc[2 * i] - pSrc[2 * l];
+            pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
+
+            yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+            pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
+
+            mult_32x32_keep32_R(p0, xt, cosVal);
+            mult_32x32_keep32_R(p1, yt, cosVal);
+            multSub_32x32_keep32_R(p0, yt, sinVal);
+            multAcc_32x32_keep32_R(p1, xt, sinVal);
+
+            pSrc[2U * l] = p0;
+            pSrc[2U * l + 1U] = p1;
+         }                         // butterfly loop end
+
+      }                           // groups loop end
+
+      twidCoefModifier = twidCoefModifier << 1U;
+   }                             // stages loop end
+   // Last stage: plain add and subtract, with no shift and no twiddle multiply.
+   n1 = n2;
+   n2 = n2 >> 1;
+   ia = 0;
+   // NOTE(review): cosVal, sinVal and ia are still set below, but the final loop never reads them.
+   cosVal = pCoef[ia * 2];
+   sinVal = pCoef[(ia * 2) + 1];
+   ia = ia + twidCoefModifier;
+
+   // loop for butterfly (two butterflies per pass)
+   for (i = 0; i < fftLen; i += n1)
+   {
+      l = i + n2;
+      xt = pSrc[2 * i] - pSrc[2 * l];
+      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+      pSrc[2U * l] = xt;
+
+      pSrc[2U * l + 1U] = yt;
+      // second butterfly of this pass, one n1 stride further on
+      i += n1;
+      l = i + n2;
+
+      xt = pSrc[2 * i] - pSrc[2 * l];
+      pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+      pSrc[2U * l] = xt;
+
+      pSrc[2U * l + 1U] = yt;
+
+   }                             // butterfly loop end
+
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
index 9d9d4d5..675a303 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
@@ -1,1203 +1,1200 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix4_f32.c
- * Description:  Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-extern void arm_bitreversal_f32(
-        float32_t * pSrc,
-        uint16_t fftSize,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab);
-
-void arm_radix4_butterfly_f32(
-        float32_t * pSrc,
-        uint16_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier);
-
-void arm_radix4_butterfly_inverse_f32(
-        float32_t * pSrc,
-        uint16_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier,
-        float32_t onebyfftLen);
-
-
-
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-
-/**
-  @brief         Processing function for the floating-point Radix-4 CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
-  @param[in]     S    points to an instance of the floating-point Radix-4 CFFT/CIFFT structure
-  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @return        none
- */
-
-void arm_cfft_radix4_f32(
-  const arm_cfft_radix4_instance_f32 * S,
-        float32_t * pSrc)
-{
-   if (S->ifftFlag == 1U)
-   {
-      /*  Complex IFFT radix-4  */
-      arm_radix4_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier, S->onebyfftLen);
-   }
-   else
-   {
-      /*  Complex FFT radix-4  */
-      arm_radix4_butterfly_f32(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
-   }
-
-   if (S->bitReverseFlag == 1U)
-   {
-      /*  Bit Reversal */
-      arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
-   }
-
-}
-
-/**
-  @} end of ComplexFFT group
- */
-
-/* ----------------------------------------------------------------------
- * Internal helper function used by the FFTs
- * ---------------------------------------------------------------------- */
-
-/**
-  brief         Core function for the floating-point CFFT butterfly process.
-  param[in,out] pSrc             points to the in-place buffer of floating-point data type
-  param[in]     fftLen           length of the FFT
-  param[in]     pCoef            points to the twiddle coefficient buffer
-  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  return        none
- */
-
-void arm_radix4_butterfly_f32(
-        float32_t * pSrc,
-        uint16_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier)
-{
-        float32_t co1, co2, co3, si1, si2, si3;
-        uint32_t ia1, ia2, ia3;
-        uint32_t i0, i1, i2, i3;
-        uint32_t n1, n2, j, k;
-
-#if defined (ARM_MATH_LOOPUNROLL)
-
-        float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
-        float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
-        Ybminusd;
-        float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
-        float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
-        float32_t *ptr1;
-        float32_t p0,p1,p2,p3,p4,p5;
-        float32_t a0,a1,a2,a3,a4,a5,a6,a7;
-
-   /*  Initializations for the first stage */
-   n2 = fftLen;
-   n1 = n2;
-
-   /* n2 = fftLen/4 */
-   n2 >>= 2U;
-   i0 = 0U;
-   ia1 = 0U;
-
-   j = n2;
-
-   /*  Calculation of first stage */
-   do
-   {
-      /*  index calculation for the input as, */
-      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
-      i1 = i0 + n2;
-      i2 = i1 + n2;
-      i3 = i2 + n2;
-
-      xaIn = pSrc[(2U * i0)];
-      yaIn = pSrc[(2U * i0) + 1U];
-
-      xbIn = pSrc[(2U * i1)];
-      ybIn = pSrc[(2U * i1) + 1U];
-
-      xcIn = pSrc[(2U * i2)];
-      ycIn = pSrc[(2U * i2) + 1U];
-
-      xdIn = pSrc[(2U * i3)];
-      ydIn = pSrc[(2U * i3) + 1U];
-
-      /* xa + xc */
-      Xaplusc = xaIn + xcIn;
-      /* xb + xd */
-      Xbplusd = xbIn + xdIn;
-      /* ya + yc */
-      Yaplusc = yaIn + ycIn;
-      /* yb + yd */
-      Ybplusd = ybIn + ydIn;
-
-      /*  index calculation for the coefficients */
-      ia2 = ia1 + ia1;
-      co2 = pCoef[ia2 * 2U];
-      si2 = pCoef[(ia2 * 2U) + 1U];
-
-      /* xa - xc */
-      Xaminusc = xaIn - xcIn;
-      /* xb - xd */
-      Xbminusd = xbIn - xdIn;
-      /* ya - yc */
-      Yaminusc = yaIn - ycIn;
-      /* yb - yd */
-      Ybminusd = ybIn - ydIn;
-
-      /* xa' = xa + xb + xc + xd */
-      pSrc[(2U * i0)] = Xaplusc + Xbplusd;
-      /* ya' = ya + yb + yc + yd */
-      pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
-
-      /* (xa - xc) + (yb - yd) */
-      Xb12C_out = (Xaminusc + Ybminusd);
-      /* (ya - yc) + (xb - xd) */
-      Yb12C_out = (Yaminusc - Xbminusd);
-      /* (xa + xc) - (xb + xd) */
-      Xc12C_out = (Xaplusc - Xbplusd);
-      /* (ya + yc) - (yb + yd) */
-      Yc12C_out = (Yaplusc - Ybplusd);
-      /* (xa - xc) - (yb - yd) */
-      Xd12C_out = (Xaminusc - Ybminusd);
-      /* (ya - yc) + (xb - xd) */
-      Yd12C_out = (Xbminusd + Yaminusc);
-
-      co1 = pCoef[ia1 * 2U];
-      si1 = pCoef[(ia1 * 2U) + 1U];
-
-      /*  index calculation for the coefficients */
-      ia3 = ia2 + ia1;
-      co3 = pCoef[ia3 * 2U];
-      si3 = pCoef[(ia3 * 2U) + 1U];
-
-      Xb12_out = Xb12C_out * co1;
-      Yb12_out = Yb12C_out * co1;
-      Xc12_out = Xc12C_out * co2;
-      Yc12_out = Yc12C_out * co2;
-      Xd12_out = Xd12C_out * co3;
-      Yd12_out = Yd12C_out * co3;
-
-      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-      //Xb12_out -= Yb12C_out * si1;
-      p0 = Yb12C_out * si1;
-      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-      //Yb12_out += Xb12C_out * si1;
-      p1 = Xb12C_out * si1;
-      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-      //Xc12_out -= Yc12C_out * si2;
-      p2 = Yc12C_out * si2;
-      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-      //Yc12_out += Xc12C_out * si2;
-      p3 = Xc12C_out * si2;
-      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-      //Xd12_out -= Yd12C_out * si3;
-      p4 = Yd12C_out * si3;
-      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-      //Yd12_out += Xd12C_out * si3;
-      p5 = Xd12C_out * si3;
-
-      Xb12_out += p0;
-      Yb12_out -= p1;
-      Xc12_out += p2;
-      Yc12_out -= p3;
-      Xd12_out += p4;
-      Yd12_out -= p5;
-
-      /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
-      pSrc[2U * i1] = Xc12_out;
-
-      /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
-      pSrc[(2U * i1) + 1U] = Yc12_out;
-
-      /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
-      pSrc[2U * i2] = Xb12_out;
-
-      /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
-      pSrc[(2U * i2) + 1U] = Yb12_out;
-
-      /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
-      pSrc[2U * i3] = Xd12_out;
-
-      /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
-      pSrc[(2U * i3) + 1U] = Yd12_out;
-
-      /*  Twiddle coefficients index modifier */
-      ia1 += twidCoefModifier;
-
-      /*  Updating input index */
-      i0++;
-
-   }
-   while (--j);
-
-   twidCoefModifier <<= 2U;
-
-   /*  Calculation of second stage to excluding last stage */
-   for (k = fftLen >> 2U; k > 4U; k >>= 2U)
-   {
-      /*  Initializations for the first stage */
-      n1 = n2;
-      n2 >>= 2U;
-      ia1 = 0U;
-
-      /*  Calculation of first stage */
-      j = 0;
-      do
-      {
-         /*  index calculation for the coefficients */
-         ia2 = ia1 + ia1;
-         ia3 = ia2 + ia1;
-         co1 = pCoef[(ia1 * 2U)];
-         si1 = pCoef[(ia1 * 2U) + 1U];
-         co2 = pCoef[(ia2 * 2U)];
-         si2 = pCoef[(ia2 * 2U) + 1U];
-         co3 = pCoef[(ia3 * 2U)];
-         si3 = pCoef[(ia3 * 2U) + 1U];
-
-         /*  Twiddle coefficients index modifier */
-         ia1 += twidCoefModifier;
-
-         i0 = j;
-         do
-         {
-            /*  index calculation for the input as, */
-            /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
-            i1 = i0 + n2;
-            i2 = i1 + n2;
-            i3 = i2 + n2;
-
-            xaIn = pSrc[(2U * i0)];
-            yaIn = pSrc[(2U * i0) + 1U];
-
-            xbIn = pSrc[(2U * i1)];
-            ybIn = pSrc[(2U * i1) + 1U];
-
-            xcIn = pSrc[(2U * i2)];
-            ycIn = pSrc[(2U * i2) + 1U];
-
-            xdIn = pSrc[(2U * i3)];
-            ydIn = pSrc[(2U * i3) + 1U];
-
-            /* xa - xc */
-            Xaminusc = xaIn - xcIn;
-            /* (xb - xd) */
-            Xbminusd = xbIn - xdIn;
-            /* ya - yc */
-            Yaminusc = yaIn - ycIn;
-            /* (yb - yd) */
-            Ybminusd = ybIn - ydIn;
-
-            /* xa + xc */
-            Xaplusc = xaIn + xcIn;
-            /* xb + xd */
-            Xbplusd = xbIn + xdIn;
-            /* ya + yc */
-            Yaplusc = yaIn + ycIn;
-            /* yb + yd */
-            Ybplusd = ybIn + ydIn;
-
-            /* (xa - xc) + (yb - yd) */
-            Xb12C_out = (Xaminusc + Ybminusd);
-            /* (ya - yc) -  (xb - xd) */
-            Yb12C_out = (Yaminusc - Xbminusd);
-            /* xa + xc -(xb + xd) */
-            Xc12C_out = (Xaplusc - Xbplusd);
-            /* (ya + yc) - (yb + yd) */
-            Yc12C_out = (Yaplusc - Ybplusd);
-            /* (xa - xc) - (yb - yd) */
-            Xd12C_out = (Xaminusc - Ybminusd);
-            /* (ya - yc) +  (xb - xd) */
-            Yd12C_out = (Xbminusd + Yaminusc);
-
-            pSrc[(2U * i0)] = Xaplusc + Xbplusd;
-            pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
-
-            Xb12_out = Xb12C_out * co1;
-            Yb12_out = Yb12C_out * co1;
-            Xc12_out = Xc12C_out * co2;
-            Yc12_out = Yc12C_out * co2;
-            Xd12_out = Xd12C_out * co3;
-            Yd12_out = Yd12C_out * co3;
-
-            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-            //Xb12_out -= Yb12C_out * si1;
-            p0 = Yb12C_out * si1;
-            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-            //Yb12_out += Xb12C_out * si1;
-            p1 = Xb12C_out * si1;
-            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-            //Xc12_out -= Yc12C_out * si2;
-            p2 = Yc12C_out * si2;
-            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-            //Yc12_out += Xc12C_out * si2;
-            p3 = Xc12C_out * si2;
-            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-            //Xd12_out -= Yd12C_out * si3;
-            p4 = Yd12C_out * si3;
-            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-            //Yd12_out += Xd12C_out * si3;
-            p5 = Xd12C_out * si3;
-
-            Xb12_out += p0;
-            Yb12_out -= p1;
-            Xc12_out += p2;
-            Yc12_out -= p3;
-            Xd12_out += p4;
-            Yd12_out -= p5;
-
-            /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
-            pSrc[2U * i1] = Xc12_out;
-
-            /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
-            pSrc[(2U * i1) + 1U] = Yc12_out;
-
-            /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
-            pSrc[2U * i2] = Xb12_out;
-
-            /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
-            pSrc[(2U * i2) + 1U] = Yb12_out;
-
-            /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
-            pSrc[2U * i3] = Xd12_out;
-
-            /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
-            pSrc[(2U * i3) + 1U] = Yd12_out;
-
-            i0 += n1;
-         } while (i0 < fftLen);
-         j++;
-      } while (j <= (n2 - 1U));
-      twidCoefModifier <<= 2U;
-   }
-
-   j = fftLen >> 2;
-   ptr1 = &pSrc[0];
-
-   /*  Calculations of last stage */
-   do
-   {
-      xaIn = ptr1[0];
-      yaIn = ptr1[1];
-      xbIn = ptr1[2];
-      ybIn = ptr1[3];
-      xcIn = ptr1[4];
-      ycIn = ptr1[5];
-      xdIn = ptr1[6];
-      ydIn = ptr1[7];
-
-      /* xa + xc */
-      Xaplusc = xaIn + xcIn;
-
-      /* xa - xc */
-      Xaminusc = xaIn - xcIn;
-
-      /* ya + yc */
-      Yaplusc = yaIn + ycIn;
-
-      /* ya - yc */
-      Yaminusc = yaIn - ycIn;
-
-      /* xb + xd */
-      Xbplusd = xbIn + xdIn;
-
-      /* yb + yd */
-      Ybplusd = ybIn + ydIn;
-
-      /* (xb-xd) */
-      Xbminusd = xbIn - xdIn;
-
-      /* (yb-yd) */
-      Ybminusd = ybIn - ydIn;
-
-      /* xa' = xa + xb + xc + xd */
-      a0 = (Xaplusc + Xbplusd);
-      /* ya' = ya + yb + yc + yd */
-      a1 = (Yaplusc + Ybplusd);
-      /* xc' = (xa-xb+xc-xd) */
-      a2 = (Xaplusc - Xbplusd);
-      /* yc' = (ya-yb+yc-yd) */
-      a3 = (Yaplusc - Ybplusd);
-      /* xb' = (xa+yb-xc-yd) */
-      a4 = (Xaminusc + Ybminusd);
-      /* yb' = (ya-xb-yc+xd) */
-      a5 = (Yaminusc - Xbminusd);
-      /* xd' = (xa-yb-xc+yd)) */
-      a6 = (Xaminusc - Ybminusd);
-      /* yd' = (ya+xb-yc-xd) */
-      a7 = (Xbminusd + Yaminusc);
-
-      ptr1[0] = a0;
-      ptr1[1] = a1;
-      ptr1[2] = a2;
-      ptr1[3] = a3;
-      ptr1[4] = a4;
-      ptr1[5] = a5;
-      ptr1[6] = a6;
-      ptr1[7] = a7;
-
-      /* increment pointer by 8 */
-      ptr1 += 8U;
-   } while (--j);
-
-#else
-
-        float32_t t1, t2, r1, r2, s1, s2;
-
-   /* Initializations for the fft calculation */
-   n2 = fftLen;
-   n1 = n2;
-   for (k = fftLen; k > 1U; k >>= 2U)
-   {
-      /*  Initializations for the fft calculation */
-      n1 = n2;
-      n2 >>= 2U;
-      ia1 = 0U;
-
-      /*  FFT Calculation */
-      j = 0;
-      do
-      {
-         /*  index calculation for the coefficients */
-         ia2 = ia1 + ia1;
-         ia3 = ia2 + ia1;
-         co1 = pCoef[ia1 * 2U];
-         si1 = pCoef[(ia1 * 2U) + 1U];
-         co2 = pCoef[ia2 * 2U];
-         si2 = pCoef[(ia2 * 2U) + 1U];
-         co3 = pCoef[ia3 * 2U];
-         si3 = pCoef[(ia3 * 2U) + 1U];
-
-         /*  Twiddle coefficients index modifier */
-         ia1 = ia1 + twidCoefModifier;
-
-         i0 = j;
-         do
-         {
-            /*  index calculation for the input as, */
-            /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
-            i1 = i0 + n2;
-            i2 = i1 + n2;
-            i3 = i2 + n2;
-
-            /* xa + xc */
-            r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
-
-            /* xa - xc */
-            r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
-
-            /* ya + yc */
-            s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
-
-            /* ya - yc */
-            s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
-
-            /* xb + xd */
-            t1 = pSrc[2U * i1] + pSrc[2U * i3];
-
-            /* xa' = xa + xb + xc + xd */
-            pSrc[2U * i0] = r1 + t1;
-
-            /* xa + xc -(xb + xd) */
-            r1 = r1 - t1;
-
-            /* yb + yd */
-            t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
-
-            /* ya' = ya + yb + yc + yd */
-            pSrc[(2U * i0) + 1U] = s1 + t2;
-
-            /* (ya + yc) - (yb + yd) */
-            s1 = s1 - t2;
-
-            /* (yb - yd) */
-            t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
-
-            /* (xb - xd) */
-            t2 = pSrc[2U * i1] - pSrc[2U * i3];
-
-            /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
-            pSrc[2U * i1] = (r1 * co2) + (s1 * si2);
-
-            /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
-            pSrc[(2U * i1) + 1U] = (s1 * co2) - (r1 * si2);
-
-            /* (xa - xc) + (yb - yd) */
-            r1 = r2 + t1;
-
-            /* (xa - xc) - (yb - yd) */
-            r2 = r2 - t1;
-
-            /* (ya - yc) -  (xb - xd) */
-            s1 = s2 - t2;
-
-            /* (ya - yc) +  (xb - xd) */
-            s2 = s2 + t2;
-
-            /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
-            pSrc[2U * i2] = (r1 * co1) + (s1 * si1);
-
-            /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
-            pSrc[(2U * i2) + 1U] = (s1 * co1) - (r1 * si1);
-
-            /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
-            pSrc[2U * i3] = (r2 * co3) + (s2 * si3);
-
-            /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
-            pSrc[(2U * i3) + 1U] = (s2 * co3) - (r2 * si3);
-
-            i0 += n1;
-         } while ( i0 < fftLen);
-         j++;
-      } while (j <= (n2 - 1U));
-      twidCoefModifier <<= 2U;
-   }
-
-#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
-}
-
-/**
-  brief         Core function for the floating-point CIFFT butterfly process.
-  param[in,out] pSrc             points to the in-place buffer of floating-point data type
-  param[in]     fftLen           length of the FFT
-  param[in]     pCoef            points to twiddle coefficient buffer
-  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-  param[in]     onebyfftLen      value of 1/fftLen
-  return        none
- */
-
-void arm_radix4_butterfly_inverse_f32(
-        float32_t * pSrc,
-        uint16_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier,
-        float32_t onebyfftLen)
-{
-        float32_t co1, co2, co3, si1, si2, si3;
-        uint32_t ia1, ia2, ia3;
-        uint32_t i0, i1, i2, i3;
-        uint32_t n1, n2, j, k;
-
-#if defined (ARM_MATH_LOOPUNROLL)
-
-        float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
-        float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
-        Ybminusd;
-        float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
-        float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
-        float32_t *ptr1;
-        float32_t p0,p1,p2,p3,p4,p5,p6,p7;
-        float32_t a0,a1,a2,a3,a4,a5,a6,a7;
-
-
-   /*  Initializations for the first stage */
-   n2 = fftLen;
-   n1 = n2;
-
-   /* n2 = fftLen/4 */
-   n2 >>= 2U;
-   i0 = 0U;
-   ia1 = 0U;
-
-   j = n2;
-
-   /*  Calculation of first stage */
-   do
-   {
-      /*  index calculation for the input as, */
-      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
-      i1 = i0 + n2;
-      i2 = i1 + n2;
-      i3 = i2 + n2;
-
-      /*  Butterfly implementation */
-      xaIn = pSrc[(2U * i0)];
-      yaIn = pSrc[(2U * i0) + 1U];
-
-      xcIn = pSrc[(2U * i2)];
-      ycIn = pSrc[(2U * i2) + 1U];
-
-      xbIn = pSrc[(2U * i1)];
-      ybIn = pSrc[(2U * i1) + 1U];
-
-      xdIn = pSrc[(2U * i3)];
-      ydIn = pSrc[(2U * i3) + 1U];
-
-      /* xa + xc */
-      Xaplusc = xaIn + xcIn;
-      /* xb + xd */
-      Xbplusd = xbIn + xdIn;
-      /* ya + yc */
-      Yaplusc = yaIn + ycIn;
-      /* yb + yd */
-      Ybplusd = ybIn + ydIn;
-
-      /*  index calculation for the coefficients */
-      ia2 = ia1 + ia1;
-      co2 = pCoef[ia2 * 2U];
-      si2 = pCoef[(ia2 * 2U) + 1U];
-
-      /* xa - xc */
-      Xaminusc = xaIn - xcIn;
-      /* xb - xd */
-      Xbminusd = xbIn - xdIn;
-      /* ya - yc */
-      Yaminusc = yaIn - ycIn;
-      /* yb - yd */
-      Ybminusd = ybIn - ydIn;
-
-      /* xa' = xa + xb + xc + xd */
-      pSrc[(2U * i0)] = Xaplusc + Xbplusd;
-
-      /* ya' = ya + yb + yc + yd */
-      pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
-
-      /* (xa - xc) - (yb - yd) */
-      Xb12C_out = (Xaminusc - Ybminusd);
-      /* (ya - yc) + (xb - xd) */
-      Yb12C_out = (Yaminusc + Xbminusd);
-      /* (xa + xc) - (xb + xd) */
-      Xc12C_out = (Xaplusc - Xbplusd);
-      /* (ya + yc) - (yb + yd) */
-      Yc12C_out = (Yaplusc - Ybplusd);
-      /* (xa - xc) + (yb - yd) */
-      Xd12C_out = (Xaminusc + Ybminusd);
-      /* (ya - yc) - (xb - xd) */
-      Yd12C_out = (Yaminusc - Xbminusd);
-
-      co1 = pCoef[ia1 * 2U];
-      si1 = pCoef[(ia1 * 2U) + 1U];
-
-      /*  index calculation for the coefficients */
-      ia3 = ia2 + ia1;
-      co3 = pCoef[ia3 * 2U];
-      si3 = pCoef[(ia3 * 2U) + 1U];
-
-      Xb12_out = Xb12C_out * co1;
-      Yb12_out = Yb12C_out * co1;
-      Xc12_out = Xc12C_out * co2;
-      Yc12_out = Yc12C_out * co2;
-      Xd12_out = Xd12C_out * co3;
-      Yd12_out = Yd12C_out * co3;
-
-      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-      //Xb12_out -= Yb12C_out * si1;
-      p0 = Yb12C_out * si1;
-      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-      //Yb12_out += Xb12C_out * si1;
-      p1 = Xb12C_out * si1;
-      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-      //Xc12_out -= Yc12C_out * si2;
-      p2 = Yc12C_out * si2;
-      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-      //Yc12_out += Xc12C_out * si2;
-      p3 = Xc12C_out * si2;
-      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-      //Xd12_out -= Yd12C_out * si3;
-      p4 = Yd12C_out * si3;
-      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-      //Yd12_out += Xd12C_out * si3;
-      p5 = Xd12C_out * si3;
-
-      Xb12_out -= p0;
-      Yb12_out += p1;
-      Xc12_out -= p2;
-      Yc12_out += p3;
-      Xd12_out -= p4;
-      Yd12_out += p5;
-
-      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-      pSrc[2U * i1] = Xc12_out;
-
-      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-      pSrc[(2U * i1) + 1U] = Yc12_out;
-
-      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-      pSrc[2U * i2] = Xb12_out;
-
-      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-      pSrc[(2U * i2) + 1U] = Yb12_out;
-
-      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-      pSrc[2U * i3] = Xd12_out;
-
-      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-      pSrc[(2U * i3) + 1U] = Yd12_out;
-
-      /*  Twiddle coefficients index modifier */
-      ia1 = ia1 + twidCoefModifier;
-
-      /*  Updating input index */
-      i0 = i0 + 1U;
-
-   } while (--j);
-
-   twidCoefModifier <<= 2U;
-
-   /*  Calculation of second stage to excluding last stage */
-   for (k = fftLen >> 2U; k > 4U; k >>= 2U)
-   {
-      /*  Initializations for the first stage */
-      n1 = n2;
-      n2 >>= 2U;
-      ia1 = 0U;
-
-      /*  Calculation of first stage */
-      j = 0;
-      do
-      {
-         /*  index calculation for the coefficients */
-         ia2 = ia1 + ia1;
-         ia3 = ia2 + ia1;
-         co1 = pCoef[ia1 * 2U];
-         si1 = pCoef[(ia1 * 2U) + 1U];
-         co2 = pCoef[ia2 * 2U];
-         si2 = pCoef[(ia2 * 2U) + 1U];
-         co3 = pCoef[ia3 * 2U];
-         si3 = pCoef[(ia3 * 2U) + 1U];
-
-         /*  Twiddle coefficients index modifier */
-         ia1 = ia1 + twidCoefModifier;
-
-         i0 = j;
-         do
-         {
-            /*  index calculation for the input as, */
-            /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
-            i1 = i0 + n2;
-            i2 = i1 + n2;
-            i3 = i2 + n2;
-
-            xaIn = pSrc[(2U * i0)];
-            yaIn = pSrc[(2U * i0) + 1U];
-
-            xbIn = pSrc[(2U * i1)];
-            ybIn = pSrc[(2U * i1) + 1U];
-
-            xcIn = pSrc[(2U * i2)];
-            ycIn = pSrc[(2U * i2) + 1U];
-
-            xdIn = pSrc[(2U * i3)];
-            ydIn = pSrc[(2U * i3) + 1U];
-
-            /* xa - xc */
-            Xaminusc = xaIn - xcIn;
-            /* (xb - xd) */
-            Xbminusd = xbIn - xdIn;
-            /* ya - yc */
-            Yaminusc = yaIn - ycIn;
-            /* (yb - yd) */
-            Ybminusd = ybIn - ydIn;
-
-            /* xa + xc */
-            Xaplusc = xaIn + xcIn;
-            /* xb + xd */
-            Xbplusd = xbIn + xdIn;
-            /* ya + yc */
-            Yaplusc = yaIn + ycIn;
-            /* yb + yd */
-            Ybplusd = ybIn + ydIn;
-
-            /* (xa - xc) - (yb - yd) */
-            Xb12C_out = (Xaminusc - Ybminusd);
-            /* (ya - yc) +  (xb - xd) */
-            Yb12C_out = (Yaminusc + Xbminusd);
-            /* xa + xc -(xb + xd) */
-            Xc12C_out = (Xaplusc - Xbplusd);
-            /* (ya + yc) - (yb + yd) */
-            Yc12C_out = (Yaplusc - Ybplusd);
-            /* (xa - xc) + (yb - yd) */
-            Xd12C_out = (Xaminusc + Ybminusd);
-            /* (ya - yc) -  (xb - xd) */
-            Yd12C_out = (Yaminusc - Xbminusd);
-
-            pSrc[(2U * i0)] = Xaplusc + Xbplusd;
-            pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
-
-            Xb12_out = Xb12C_out * co1;
-            Yb12_out = Yb12C_out * co1;
-            Xc12_out = Xc12C_out * co2;
-            Yc12_out = Yc12C_out * co2;
-            Xd12_out = Xd12C_out * co3;
-            Yd12_out = Yd12C_out * co3;
-
-            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-            //Xb12_out -= Yb12C_out * si1;
-            p0 = Yb12C_out * si1;
-            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-            //Yb12_out += Xb12C_out * si1;
-            p1 = Xb12C_out * si1;
-            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-            //Xc12_out -= Yc12C_out * si2;
-            p2 = Yc12C_out * si2;
-            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-            //Yc12_out += Xc12C_out * si2;
-            p3 = Xc12C_out * si2;
-            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-            //Xd12_out -= Yd12C_out * si3;
-            p4 = Yd12C_out * si3;
-            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-            //Yd12_out += Xd12C_out * si3;
-            p5 = Xd12C_out * si3;
-
-            Xb12_out -= p0;
-            Yb12_out += p1;
-            Xc12_out -= p2;
-            Yc12_out += p3;
-            Xd12_out -= p4;
-            Yd12_out += p5;
-
-            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-            pSrc[2U * i1] = Xc12_out;
-
-            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-            pSrc[(2U * i1) + 1U] = Yc12_out;
-
-            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-            pSrc[2U * i2] = Xb12_out;
-
-            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-            pSrc[(2U * i2) + 1U] = Yb12_out;
-
-            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-            pSrc[2U * i3] = Xd12_out;
-
-            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-            pSrc[(2U * i3) + 1U] = Yd12_out;
-
-            i0 += n1;
-         } while (i0 < fftLen);
-         j++;
-      } while (j <= (n2 - 1U));
-      twidCoefModifier <<= 2U;
-   }
-   /*  Initializations of last stage */
-
-   j = fftLen >> 2;
-   ptr1 = &pSrc[0];
-
-   /*  Calculations of last stage */
-   do
-   {
-      xaIn = ptr1[0];
-      yaIn = ptr1[1];
-      xbIn = ptr1[2];
-      ybIn = ptr1[3];
-      xcIn = ptr1[4];
-      ycIn = ptr1[5];
-      xdIn = ptr1[6];
-      ydIn = ptr1[7];
-
-      /*  Butterfly implementation */
-      /* xa + xc */
-      Xaplusc = xaIn + xcIn;
-
-      /* xa - xc */
-      Xaminusc = xaIn - xcIn;
-
-      /* ya + yc */
-      Yaplusc = yaIn + ycIn;
-
-      /* ya - yc */
-      Yaminusc = yaIn - ycIn;
-
-      /* xb + xd */
-      Xbplusd = xbIn + xdIn;
-
-      /* yb + yd */
-      Ybplusd = ybIn + ydIn;
-
-      /* (xb-xd) */
-      Xbminusd = xbIn - xdIn;
-
-      /* (yb-yd) */
-      Ybminusd = ybIn - ydIn;
-
-      /* xa' = (xa+xb+xc+xd) * onebyfftLen */
-      a0 = (Xaplusc + Xbplusd);
-      /* ya' = (ya+yb+yc+yd) * onebyfftLen */
-      a1 = (Yaplusc + Ybplusd);
-      /* xc' = (xa-xb+xc-xd) * onebyfftLen */
-      a2 = (Xaplusc - Xbplusd);
-      /* yc' = (ya-yb+yc-yd) * onebyfftLen  */
-      a3 = (Yaplusc - Ybplusd);
-      /* xb' = (xa-yb-xc+yd) * onebyfftLen */
-      a4 = (Xaminusc - Ybminusd);
-      /* yb' = (ya+xb-yc-xd) * onebyfftLen */
-      a5 = (Yaminusc + Xbminusd);
-      /* xd' = (xa-yb-xc+yd) * onebyfftLen */
-      a6 = (Xaminusc + Ybminusd);
-      /* yd' = (ya-xb-yc+xd) * onebyfftLen */
-      a7 = (Yaminusc - Xbminusd);
-
-      p0 = a0 * onebyfftLen;
-      p1 = a1 * onebyfftLen;
-      p2 = a2 * onebyfftLen;
-      p3 = a3 * onebyfftLen;
-      p4 = a4 * onebyfftLen;
-      p5 = a5 * onebyfftLen;
-      p6 = a6 * onebyfftLen;
-      p7 = a7 * onebyfftLen;
-
-      /* xa' = (xa+xb+xc+xd) * onebyfftLen */
-      ptr1[0] = p0;
-      /* ya' = (ya+yb+yc+yd) * onebyfftLen */
-      ptr1[1] = p1;
-      /* xc' = (xa-xb+xc-xd) * onebyfftLen */
-      ptr1[2] = p2;
-      /* yc' = (ya-yb+yc-yd) * onebyfftLen  */
-      ptr1[3] = p3;
-      /* xb' = (xa-yb-xc+yd) * onebyfftLen */
-      ptr1[4] = p4;
-      /* yb' = (ya+xb-yc-xd) * onebyfftLen */
-      ptr1[5] = p5;
-      /* xd' = (xa-yb-xc+yd) * onebyfftLen */
-      ptr1[6] = p6;
-      /* yd' = (ya-xb-yc+xd) * onebyfftLen */
-      ptr1[7] = p7;
-
-      /* increment source pointer by 8 for next calculations */
-      ptr1 = ptr1 + 8U;
-
-   } while (--j);
-
-#else
-
-        float32_t t1, t2, r1, r2, s1, s2;
-
-   /*  Initializations for the first stage */
-   n2 = fftLen;
-   n1 = n2;
-
-   /*  Calculation of first stage */
-   for (k = fftLen; k > 4U; k >>= 2U)
-   {
-      /*  Initializations for the first stage */
-      n1 = n2;
-      n2 >>= 2U;
-      ia1 = 0U;
-
-      /*  Calculation of first stage */
-      j = 0;
-      do
-      {
-         /*  index calculation for the coefficients */
-         ia2 = ia1 + ia1;
-         ia3 = ia2 + ia1;
-         co1 = pCoef[ia1 * 2U];
-         si1 = pCoef[(ia1 * 2U) + 1U];
-         co2 = pCoef[ia2 * 2U];
-         si2 = pCoef[(ia2 * 2U) + 1U];
-         co3 = pCoef[ia3 * 2U];
-         si3 = pCoef[(ia3 * 2U) + 1U];
-
-         /*  Twiddle coefficients index modifier */
-         ia1 = ia1 + twidCoefModifier;
-
-         i0 = j;
-         do
-         {
-            /*  index calculation for the input as, */
-            /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
-            i1 = i0 + n2;
-            i2 = i1 + n2;
-            i3 = i2 + n2;
-
-            /* xa + xc */
-            r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
-
-            /* xa - xc */
-            r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
-
-            /* ya + yc */
-            s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
-
-            /* ya - yc */
-            s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
-
-            /* xb + xd */
-            t1 = pSrc[2U * i1] + pSrc[2U * i3];
-
-            /* xa' = xa + xb + xc + xd */
-            pSrc[2U * i0] = r1 + t1;
-
-            /* xa + xc -(xb + xd) */
-            r1 = r1 - t1;
-
-            /* yb + yd */
-            t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
-
-            /* ya' = ya + yb + yc + yd */
-            pSrc[(2U * i0) + 1U] = s1 + t2;
-
-            /* (ya + yc) - (yb + yd) */
-            s1 = s1 - t2;
-
-            /* (yb - yd) */
-            t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
-
-            /* (xb - xd) */
-            t2 = pSrc[2U * i1] - pSrc[2U * i3];
-
-            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-            pSrc[2U * i1] = (r1 * co2) - (s1 * si2);
-
-            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-            pSrc[(2U * i1) + 1U] = (s1 * co2) + (r1 * si2);
-
-            /* (xa - xc) - (yb - yd) */
-            r1 = r2 - t1;
-
-            /* (xa - xc) + (yb - yd) */
-            r2 = r2 + t1;
-
-            /* (ya - yc) +  (xb - xd) */
-            s1 = s2 + t2;
-
-            /* (ya - yc) -  (xb - xd) */
-            s2 = s2 - t2;
-
-            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-            pSrc[2U * i2] = (r1 * co1) - (s1 * si1);
-
-            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-            pSrc[(2U * i2) + 1U] = (s1 * co1) + (r1 * si1);
-
-            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-            pSrc[2U * i3] = (r2 * co3) - (s2 * si3);
-
-            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-            pSrc[(2U * i3) + 1U] = (s2 * co3) + (r2 * si3);
-
-            i0 += n1;
-         } while ( i0 < fftLen);
-         j++;
-      } while (j <= (n2 - 1U));
-      twidCoefModifier <<= 2U;
-   }
-   /*  Initializations of last stage */
-   n1 = n2;
-   n2 >>= 2U;
-
-   /*  Calculations of last stage */
-   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
-   {
-      /*  index calculation for the input as, */
-      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
-      i1 = i0 + n2;
-      i2 = i1 + n2;
-      i3 = i2 + n2;
-
-      /*  Butterfly implementation */
-      /* xa + xc */
-      r1 = pSrc[2U * i0] + pSrc[2U * i2];
-
-      /* xa - xc */
-      r2 = pSrc[2U * i0] - pSrc[2U * i2];
-
-      /* ya + yc */
-      s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
-
-      /* ya - yc */
-      s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
-
-      /* xc + xd */
-      t1 = pSrc[2U * i1] + pSrc[2U * i3];
-
-      /* xa' = xa + xb + xc + xd */
-      pSrc[2U * i0] = (r1 + t1) * onebyfftLen;
-
-      /* (xa + xb) - (xc + xd) */
-      r1 = r1 - t1;
-
-      /* yb + yd */
-      t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
-
-      /* ya' = ya + yb + yc + yd */
-      pSrc[(2U * i0) + 1U] = (s1 + t2) * onebyfftLen;
-
-      /* (ya + yc) - (yb + yd) */
-      s1 = s1 - t2;
-
-      /* (yb-yd) */
-      t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
-
-      /* (xb-xd) */
-      t2 = pSrc[2U * i1] - pSrc[2U * i3];
-
-      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-      pSrc[2U * i1] = r1 * onebyfftLen;
-
-      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-      pSrc[(2U * i1) + 1U] = s1 * onebyfftLen;
-
-      /* (xa - xc) - (yb-yd) */
-      r1 = r2 - t1;
-
-      /* (xa - xc) + (yb-yd) */
-      r2 = r2 + t1;
-
-      /* (ya - yc) + (xb-xd) */
-      s1 = s2 + t2;
-
-      /* (ya - yc) - (xb-xd) */
-      s2 = s2 - t2;
-
-      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-      pSrc[2U * i2] = r1 * onebyfftLen;
-
-      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-      pSrc[(2U * i2) + 1U] = s1 * onebyfftLen;
-
-      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-      pSrc[2U * i3] = r2 * onebyfftLen;
-
-      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-      pSrc[(2U * i3) + 1U] = s2 * onebyfftLen;
-   }
-
-#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-}
-
-
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix4_f32.c
+ * Description:  Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+extern void arm_bitreversal_f32(   /* bit-reversal reordering; not defined in the visible part of this file */
+        float32_t * pSrc,
+        uint16_t fftSize,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+/* Forward radix-4 butterfly kernel; its definition appears later in this file. */
+void arm_radix4_butterfly_f32(
+        float32_t * pSrc,
+        uint16_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier);
+/* Inverse radix-4 butterfly kernel (extra 1/fftLen scale argument); its definition appears later in this file. */
+void arm_radix4_butterfly_inverse_f32(
+        float32_t * pSrc,
+        uint16_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier,
+        float32_t onebyfftLen);
+
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Floating-point Radix-4 CFFT/CIFFT processing function.
+  @deprecated    Superseded by \ref arm_cfft_f32; do not use this function, it will be removed in the future.
+  @param[in]     S    points to an instance of the floating-point Radix-4 CFFT/CIFFT structure
+  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>; it is transformed in-place
+  @return        none
+ */
+
+void arm_cfft_radix4_f32(
+  const arm_cfft_radix4_instance_f32 * S,
+        float32_t * pSrc)
+{
+   /* ifftFlag == 1 selects the inverse transform, any other value the forward one */
+   if (S->ifftFlag != 1U)
+   {
+      /* Forward radix-4 complex FFT */
+      arm_radix4_butterfly_f32(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+   }
+   else
+   {
+      /* Inverse radix-4 complex FFT; the kernel is also handed the 1/fftLen scale */
+      arm_radix4_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier, S->onebyfftLen);
+   }
+
+   /* Bit reversal of the result, only when the instance requests it */
+   if (S->bitReverseFlag == 1U)
+   {
+      arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+   }
+}
+
+/**
+  @} end of ComplexFFT group
+ */
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * ---------------------------------------------------------------------- */
+
+/**
+  brief         Core function for the floating-point CFFT butterfly process (forward transform, radix-4, computed in-place).
+  param[in,out] pSrc             points to the in-place buffer of floating-point data type (x part at even, y part at odd indices)
+  param[in]     fftLen           length of the FFT
+  param[in]     pCoef            points to the twiddle coefficient buffer (co value at even, si value at odd indices)
+  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  return        none
+ */
+
+void arm_radix4_butterfly_f32(
+        float32_t * pSrc,
+        uint16_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier)
+{
+        float32_t co1, co2, co3, si1, si2, si3;
+        uint32_t ia1, ia2, ia3;
+        uint32_t i0, i1, i2, i3;
+        uint32_t n1, n2, j, k;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+   /* Unrolled variant: the first stage, the middle stages and the last stage are coded separately */
+        float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+        float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+        Ybminusd;
+        float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+        float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+        float32_t *ptr1;
+        float32_t p0,p1,p2,p3,p4,p5;
+        float32_t a0,a1,a2,a3,a4,a5,a6,a7;
+
+   /*  Initializations for the first stage */
+   n2 = fftLen;
+   n1 = n2;
+
+   /* n2 = fftLen/4 */
+   n2 >>= 2U;
+   i0 = 0U;
+   ia1 = 0U;
+
+   j = n2;
+
+   /*  Calculation of first stage: one butterfly per iteration, fftLen/4 iterations */
+   do
+   {
+      /*  index calculation for the input as, */
+      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+      i1 = i0 + n2;
+      i2 = i1 + n2;
+      i3 = i2 + n2;
+
+      xaIn = pSrc[(2U * i0)];
+      yaIn = pSrc[(2U * i0) + 1U];
+
+      xbIn = pSrc[(2U * i1)];
+      ybIn = pSrc[(2U * i1) + 1U];
+
+      xcIn = pSrc[(2U * i2)];
+      ycIn = pSrc[(2U * i2) + 1U];
+
+      xdIn = pSrc[(2U * i3)];
+      ydIn = pSrc[(2U * i3) + 1U];
+
+      /* xa + xc */
+      Xaplusc = xaIn + xcIn;
+      /* xb + xd */
+      Xbplusd = xbIn + xdIn;
+      /* ya + yc */
+      Yaplusc = yaIn + ycIn;
+      /* yb + yd */
+      Ybplusd = ybIn + ydIn;
+
+      /*  index calculation for the coefficients */
+      ia2 = ia1 + ia1;
+      co2 = pCoef[ia2 * 2U];
+      si2 = pCoef[(ia2 * 2U) + 1U];
+
+      /* xa - xc */
+      Xaminusc = xaIn - xcIn;
+      /* xb - xd */
+      Xbminusd = xbIn - xdIn;
+      /* ya - yc */
+      Yaminusc = yaIn - ycIn;
+      /* yb - yd */
+      Ybminusd = ybIn - ydIn;
+
+      /* xa' = xa + xb + xc + xd */
+      pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+      /* ya' = ya + yb + yc + yd */
+      pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+      /* (xa - xc) + (yb - yd) */
+      Xb12C_out = (Xaminusc + Ybminusd);
+      /* (ya - yc) - (xb - xd) */
+      Yb12C_out = (Yaminusc - Xbminusd);
+      /* (xa + xc) - (xb + xd) */
+      Xc12C_out = (Xaplusc - Xbplusd);
+      /* (ya + yc) - (yb + yd) */
+      Yc12C_out = (Yaplusc - Ybplusd);
+      /* (xa - xc) - (yb - yd) */
+      Xd12C_out = (Xaminusc - Ybminusd);
+      /* (ya - yc) + (xb - xd) */
+      Yd12C_out = (Xbminusd + Yaminusc);
+
+      co1 = pCoef[ia1 * 2U];
+      si1 = pCoef[(ia1 * 2U) + 1U];
+
+      /*  index calculation for the coefficients */
+      ia3 = ia2 + ia1;
+      co3 = pCoef[ia3 * 2U];
+      si3 = pCoef[(ia3 * 2U) + 1U];
+
+      Xb12_out = Xb12C_out * co1;
+      Yb12_out = Yb12C_out * co1;
+      Xc12_out = Xc12C_out * co2;
+      Yc12_out = Yc12C_out * co2;
+      Xd12_out = Xd12C_out * co3;
+      Yd12_out = Yd12C_out * co3;
+
+      /* si term of xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+      /* p0 is added to Xb12_out below */
+      p0 = Yb12C_out * si1;
+      /* si term of yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+      /* p1 is subtracted from Yb12_out below */
+      p1 = Xb12C_out * si1;
+      /* si term of xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+      /* p2 is added to Xc12_out below */
+      p2 = Yc12C_out * si2;
+      /* si term of yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+      /* p3 is subtracted from Yc12_out below */
+      p3 = Xc12C_out * si2;
+      /* si term of xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+      /* p4 is added to Xd12_out below */
+      p4 = Yd12C_out * si3;
+      /* si term of yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+      /* p5 is subtracted from Yd12_out below */
+      p5 = Xd12C_out * si3;
+
+      Xb12_out += p0;
+      Yb12_out -= p1;
+      Xc12_out += p2;
+      Yc12_out -= p3;
+      Xd12_out += p4;
+      Yd12_out -= p5;
+
+      /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+      pSrc[2U * i1] = Xc12_out;
+
+      /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+      pSrc[(2U * i1) + 1U] = Yc12_out;
+
+      /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+      pSrc[2U * i2] = Xb12_out;
+
+      /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+      pSrc[(2U * i2) + 1U] = Yb12_out;
+
+      /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+      pSrc[2U * i3] = Xd12_out;
+
+      /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+      pSrc[(2U * i3) + 1U] = Yd12_out;
+
+      /*  Twiddle coefficients index modifier */
+      ia1 += twidCoefModifier;
+
+      /*  Updating input index */
+      i0++;
+
+   }
+   while (--j);
+
+   twidCoefModifier <<= 2U;
+
+   /*  Calculation of the second stage up to, but excluding, the last stage */
+   for (k = fftLen >> 2U; k > 4U; k >>= 2U)
+   {
+      /*  Initializations for the current stage */
+      n1 = n2;
+      n2 >>= 2U;
+      ia1 = 0U;
+
+      /*  Calculation of the current stage */
+      j = 0;
+      do
+      {
+         /*  index calculation for the coefficients */
+         ia2 = ia1 + ia1;
+         ia3 = ia2 + ia1;
+         co1 = pCoef[(ia1 * 2U)];
+         si1 = pCoef[(ia1 * 2U) + 1U];
+         co2 = pCoef[(ia2 * 2U)];
+         si2 = pCoef[(ia2 * 2U) + 1U];
+         co3 = pCoef[(ia3 * 2U)];
+         si3 = pCoef[(ia3 * 2U) + 1U];
+
+         /*  Twiddle coefficients index modifier */
+         ia1 += twidCoefModifier;
+
+         i0 = j;
+         do
+         {
+            /*  index calculation for the input as, */
+            /*  pSrc[i0 + 0], pSrc[i0 + n2], pSrc[i0 + 2*n2], pSrc[i0 + 3*n2] */
+            i1 = i0 + n2;
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+
+            xaIn = pSrc[(2U * i0)];
+            yaIn = pSrc[(2U * i0) + 1U];
+
+            xbIn = pSrc[(2U * i1)];
+            ybIn = pSrc[(2U * i1) + 1U];
+
+            xcIn = pSrc[(2U * i2)];
+            ycIn = pSrc[(2U * i2) + 1U];
+
+            xdIn = pSrc[(2U * i3)];
+            ydIn = pSrc[(2U * i3) + 1U];
+
+            /* xa - xc */
+            Xaminusc = xaIn - xcIn;
+            /* (xb - xd) */
+            Xbminusd = xbIn - xdIn;
+            /* ya - yc */
+            Yaminusc = yaIn - ycIn;
+            /* (yb - yd) */
+            Ybminusd = ybIn - ydIn;
+
+            /* xa + xc */
+            Xaplusc = xaIn + xcIn;
+            /* xb + xd */
+            Xbplusd = xbIn + xdIn;
+            /* ya + yc */
+            Yaplusc = yaIn + ycIn;
+            /* yb + yd */
+            Ybplusd = ybIn + ydIn;
+
+            /* (xa - xc) + (yb - yd) */
+            Xb12C_out = (Xaminusc + Ybminusd);
+            /* (ya - yc) -  (xb - xd) */
+            Yb12C_out = (Yaminusc - Xbminusd);
+            /* xa + xc -(xb + xd) */
+            Xc12C_out = (Xaplusc - Xbplusd);
+            /* (ya + yc) - (yb + yd) */
+            Yc12C_out = (Yaplusc - Ybplusd);
+            /* (xa - xc) - (yb - yd) */
+            Xd12C_out = (Xaminusc - Ybminusd);
+            /* (ya - yc) +  (xb - xd) */
+            Yd12C_out = (Xbminusd + Yaminusc);
+
+            pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+            pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+            Xb12_out = Xb12C_out * co1;
+            Yb12_out = Yb12C_out * co1;
+            Xc12_out = Xc12C_out * co2;
+            Yc12_out = Yc12C_out * co2;
+            Xd12_out = Xd12C_out * co3;
+            Yd12_out = Yd12C_out * co3;
+
+            /* si term of xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+            /* p0 is added to Xb12_out below */
+            p0 = Yb12C_out * si1;
+            /* si term of yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+            /* p1 is subtracted from Yb12_out below */
+            p1 = Xb12C_out * si1;
+            /* si term of xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+            /* p2 is added to Xc12_out below */
+            p2 = Yc12C_out * si2;
+            /* si term of yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+            /* p3 is subtracted from Yc12_out below */
+            p3 = Xc12C_out * si2;
+            /* si term of xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+            /* p4 is added to Xd12_out below */
+            p4 = Yd12C_out * si3;
+            /* si term of yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+            /* p5 is subtracted from Yd12_out below */
+            p5 = Xd12C_out * si3;
+
+            Xb12_out += p0;
+            Yb12_out -= p1;
+            Xc12_out += p2;
+            Yc12_out -= p3;
+            Xd12_out += p4;
+            Yd12_out -= p5;
+
+            /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+            pSrc[2U * i1] = Xc12_out;
+
+            /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+            pSrc[(2U * i1) + 1U] = Yc12_out;
+
+            /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+            pSrc[2U * i2] = Xb12_out;
+
+            /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+            pSrc[(2U * i2) + 1U] = Yb12_out;
+
+            /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+            pSrc[2U * i3] = Xd12_out;
+
+            /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+            pSrc[(2U * i3) + 1U] = Yd12_out;
+
+            i0 += n1;
+         } while (i0 < fftLen);
+         j++;
+      } while (j <= (n2 - 1U));
+      twidCoefModifier <<= 2U;
+   }
+
+   j = fftLen >> 2;
+   ptr1 = &pSrc[0];
+
+   /*  Calculations of last stage: no twiddle multiplication, four consecutive complex points per iteration */
+   do
+   {
+      xaIn = ptr1[0];
+      yaIn = ptr1[1];
+      xbIn = ptr1[2];
+      ybIn = ptr1[3];
+      xcIn = ptr1[4];
+      ycIn = ptr1[5];
+      xdIn = ptr1[6];
+      ydIn = ptr1[7];
+
+      /* xa + xc */
+      Xaplusc = xaIn + xcIn;
+
+      /* xa - xc */
+      Xaminusc = xaIn - xcIn;
+
+      /* ya + yc */
+      Yaplusc = yaIn + ycIn;
+
+      /* ya - yc */
+      Yaminusc = yaIn - ycIn;
+
+      /* xb + xd */
+      Xbplusd = xbIn + xdIn;
+
+      /* yb + yd */
+      Ybplusd = ybIn + ydIn;
+
+      /* (xb-xd) */
+      Xbminusd = xbIn - xdIn;
+
+      /* (yb-yd) */
+      Ybminusd = ybIn - ydIn;
+
+      /* xa' = xa + xb + xc + xd */
+      a0 = (Xaplusc + Xbplusd);
+      /* ya' = ya + yb + yc + yd */
+      a1 = (Yaplusc + Ybplusd);
+      /* xc' = (xa-xb+xc-xd) */
+      a2 = (Xaplusc - Xbplusd);
+      /* yc' = (ya-yb+yc-yd) */
+      a3 = (Yaplusc - Ybplusd);
+      /* xb' = (xa+yb-xc-yd) */
+      a4 = (Xaminusc + Ybminusd);
+      /* yb' = (ya-xb-yc+xd) */
+      a5 = (Yaminusc - Xbminusd);
+      /* xd' = (xa-yb-xc+yd) */
+      a6 = (Xaminusc - Ybminusd);
+      /* yd' = (ya+xb-yc-xd) */
+      a7 = (Xbminusd + Yaminusc);
+
+      ptr1[0] = a0;
+      ptr1[1] = a1;
+      ptr1[2] = a2;
+      ptr1[3] = a3;
+      ptr1[4] = a4;
+      ptr1[5] = a5;
+      ptr1[6] = a6;
+      ptr1[7] = a7;
+
+      /* advance by 8 float32_t elements (4 complex points) */
+      ptr1 += 8U;
+   } while (--j);
+
+#else
+   /* Generic variant: a single loop nest processes every stage, including the last one */
+        float32_t t1, t2, r1, r2, s1, s2;
+
+   /* Initializations for the fft calculation */
+   n2 = fftLen;
+   n1 = n2;
+   for (k = fftLen; k > 1U; k >>= 2U)
+   {
+      /*  Initializations for the current stage */
+      n1 = n2;
+      n2 >>= 2U;
+      ia1 = 0U;
+
+      /*  FFT Calculation */
+      j = 0;
+      do
+      {
+         /*  index calculation for the coefficients */
+         ia2 = ia1 + ia1;
+         ia3 = ia2 + ia1;
+         co1 = pCoef[ia1 * 2U];
+         si1 = pCoef[(ia1 * 2U) + 1U];
+         co2 = pCoef[ia2 * 2U];
+         si2 = pCoef[(ia2 * 2U) + 1U];
+         co3 = pCoef[ia3 * 2U];
+         si3 = pCoef[(ia3 * 2U) + 1U];
+
+         /*  Twiddle coefficients index modifier */
+         ia1 = ia1 + twidCoefModifier;
+
+         i0 = j;
+         do
+         {
+            /*  index calculation for the input as, */
+            /*  pSrc[i0 + 0], pSrc[i0 + n2], pSrc[i0 + 2*n2], pSrc[i0 + 3*n2] */
+            i1 = i0 + n2;
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+
+            /* xa + xc */
+            r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
+
+            /* xa - xc */
+            r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
+
+            /* ya + yc */
+            s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+            /* ya - yc */
+            s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+            /* xb + xd */
+            t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+            /* xa' = xa + xb + xc + xd */
+            pSrc[2U * i0] = r1 + t1;
+
+            /* xa + xc -(xb + xd) */
+            r1 = r1 - t1;
+
+            /* yb + yd */
+            t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+            /* ya' = ya + yb + yc + yd */
+            pSrc[(2U * i0) + 1U] = s1 + t2;
+
+            /* (ya + yc) - (yb + yd) */
+            s1 = s1 - t2;
+
+            /* (yb - yd) */
+            t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+            /* (xb - xd) */
+            t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+            /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+            pSrc[2U * i1] = (r1 * co2) + (s1 * si2);
+
+            /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+            pSrc[(2U * i1) + 1U] = (s1 * co2) - (r1 * si2);
+
+            /* (xa - xc) + (yb - yd) */
+            r1 = r2 + t1;
+
+            /* (xa - xc) - (yb - yd) */
+            r2 = r2 - t1;
+
+            /* (ya - yc) -  (xb - xd) */
+            s1 = s2 - t2;
+
+            /* (ya - yc) +  (xb - xd) */
+            s2 = s2 + t2;
+
+            /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+            pSrc[2U * i2] = (r1 * co1) + (s1 * si1);
+
+            /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+            pSrc[(2U * i2) + 1U] = (s1 * co1) - (r1 * si1);
+
+            /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+            pSrc[2U * i3] = (r2 * co3) + (s2 * si3);
+
+            /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+            pSrc[(2U * i3) + 1U] = (s2 * co3) - (r2 * si3);
+
+            i0 += n1;
+         } while ( i0 < fftLen);
+         j++;
+      } while (j <= (n2 - 1U));
+      twidCoefModifier <<= 2U;
+   }
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+}
+
+/**
+  brief         Core function for the floating-point CIFFT butterfly process.
+  param[in,out] pSrc             points to the in-place buffer of floating-point data type
+  param[in]     fftLen           length of the FFT
+  param[in]     pCoef            points to twiddle coefficient buffer
+  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+  param[in]     onebyfftLen      value of 1/fftLen
+  return        none
+ */
+
+void arm_radix4_butterfly_inverse_f32(
+        float32_t * pSrc,
+        uint16_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier,
+        float32_t onebyfftLen)
+{
+        float32_t co1, co2, co3, si1, si2, si3;
+        uint32_t ia1, ia2, ia3;
+        uint32_t i0, i1, i2, i3;
+        uint32_t n1, n2, j, k;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+        float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+        Ybminusd;
+        float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+        float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+        float32_t *ptr1;
+        float32_t p0,p1,p2,p3,p4,p5,p6,p7;
+        float32_t a0,a1,a2,a3,a4,a5,a6,a7;
+
+
+   /*  Initializations for the first stage */
+   n2 = fftLen;
+   n1 = n2;
+
+   /* n2 = fftLen/4 */
+   n2 >>= 2U;
+   i0 = 0U;
+   ia1 = 0U;
+
+   j = n2;
+
+   /*  Calculation of first stage */
+   do
+   {
+      /*  index calculation for the input as, */
+      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+      i1 = i0 + n2;
+      i2 = i1 + n2;
+      i3 = i2 + n2;
+
+      /*  Butterfly implementation */
+      xaIn = pSrc[(2U * i0)];
+      yaIn = pSrc[(2U * i0) + 1U];
+
+      xcIn = pSrc[(2U * i2)];
+      ycIn = pSrc[(2U * i2) + 1U];
+
+      xbIn = pSrc[(2U * i1)];
+      ybIn = pSrc[(2U * i1) + 1U];
+
+      xdIn = pSrc[(2U * i3)];
+      ydIn = pSrc[(2U * i3) + 1U];
+
+      /* xa + xc */
+      Xaplusc = xaIn + xcIn;
+      /* xb + xd */
+      Xbplusd = xbIn + xdIn;
+      /* ya + yc */
+      Yaplusc = yaIn + ycIn;
+      /* yb + yd */
+      Ybplusd = ybIn + ydIn;
+
+      /*  index calculation for the coefficients */
+      ia2 = ia1 + ia1;
+      co2 = pCoef[ia2 * 2U];
+      si2 = pCoef[(ia2 * 2U) + 1U];
+
+      /* xa - xc */
+      Xaminusc = xaIn - xcIn;
+      /* xb - xd */
+      Xbminusd = xbIn - xdIn;
+      /* ya - yc */
+      Yaminusc = yaIn - ycIn;
+      /* yb - yd */
+      Ybminusd = ybIn - ydIn;
+
+      /* xa' = xa + xb + xc + xd */
+      pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+
+      /* ya' = ya + yb + yc + yd */
+      pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+      /* (xa - xc) - (yb - yd) */
+      Xb12C_out = (Xaminusc - Ybminusd);
+      /* (ya - yc) + (xb - xd) */
+      Yb12C_out = (Yaminusc + Xbminusd);
+      /* (xa + xc) - (xb + xd) */
+      Xc12C_out = (Xaplusc - Xbplusd);
+      /* (ya + yc) - (yb + yd) */
+      Yc12C_out = (Yaplusc - Ybplusd);
+      /* (xa - xc) + (yb - yd) */
+      Xd12C_out = (Xaminusc + Ybminusd);
+      /* (ya - yc) - (xb - xd) */
+      Yd12C_out = (Yaminusc - Xbminusd);
+
+      co1 = pCoef[ia1 * 2U];
+      si1 = pCoef[(ia1 * 2U) + 1U];
+
+      /*  index calculation for the coefficients */
+      ia3 = ia2 + ia1;
+      co3 = pCoef[ia3 * 2U];
+      si3 = pCoef[(ia3 * 2U) + 1U];
+
+      Xb12_out = Xb12C_out * co1;
+      Yb12_out = Yb12C_out * co1;
+      Xc12_out = Xc12C_out * co2;
+      Yc12_out = Yc12C_out * co2;
+      Xd12_out = Xd12C_out * co3;
+      Yd12_out = Yd12C_out * co3;
+
+      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+      //Xb12_out -= Yb12C_out * si1;
+      p0 = Yb12C_out * si1;
+      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+      //Yb12_out += Xb12C_out * si1;
+      p1 = Xb12C_out * si1;
+      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+      //Xc12_out -= Yc12C_out * si2;
+      p2 = Yc12C_out * si2;
+      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+      //Yc12_out += Xc12C_out * si2;
+      p3 = Xc12C_out * si2;
+      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+      //Xd12_out -= Yd12C_out * si3;
+      p4 = Yd12C_out * si3;
+      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+      //Yd12_out += Xd12C_out * si3;
+      p5 = Xd12C_out * si3;
+
+      Xb12_out -= p0;
+      Yb12_out += p1;
+      Xc12_out -= p2;
+      Yc12_out += p3;
+      Xd12_out -= p4;
+      Yd12_out += p5;
+
+      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+      pSrc[2U * i1] = Xc12_out;
+
+      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+      pSrc[(2U * i1) + 1U] = Yc12_out;
+
+      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+      pSrc[2U * i2] = Xb12_out;
+
+      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+      pSrc[(2U * i2) + 1U] = Yb12_out;
+
+      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+      pSrc[2U * i3] = Xd12_out;
+
+      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+      pSrc[(2U * i3) + 1U] = Yd12_out;
+
+      /*  Twiddle coefficients index modifier */
+      ia1 = ia1 + twidCoefModifier;
+
+      /*  Updating input index */
+      i0 = i0 + 1U;
+
+   } while (--j);
+
+   twidCoefModifier <<= 2U;
+
+   /*  Calculation of second stage to excluding last stage */
+   for (k = fftLen >> 2U; k > 4U; k >>= 2U)
+   {
+      /*  Initializations for the first stage */
+      n1 = n2;
+      n2 >>= 2U;
+      ia1 = 0U;
+
+      /*  Calculation of first stage */
+      j = 0;
+      do
+      {
+         /*  index calculation for the coefficients */
+         ia2 = ia1 + ia1;
+         ia3 = ia2 + ia1;
+         co1 = pCoef[ia1 * 2U];
+         si1 = pCoef[(ia1 * 2U) + 1U];
+         co2 = pCoef[ia2 * 2U];
+         si2 = pCoef[(ia2 * 2U) + 1U];
+         co3 = pCoef[ia3 * 2U];
+         si3 = pCoef[(ia3 * 2U) + 1U];
+
+         /*  Twiddle coefficients index modifier */
+         ia1 = ia1 + twidCoefModifier;
+
+         i0 = j;
+         do
+         {
+            /*  index calculation for the input as, */
+            /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+            i1 = i0 + n2;
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+
+            xaIn = pSrc[(2U * i0)];
+            yaIn = pSrc[(2U * i0) + 1U];
+
+            xbIn = pSrc[(2U * i1)];
+            ybIn = pSrc[(2U * i1) + 1U];
+
+            xcIn = pSrc[(2U * i2)];
+            ycIn = pSrc[(2U * i2) + 1U];
+
+            xdIn = pSrc[(2U * i3)];
+            ydIn = pSrc[(2U * i3) + 1U];
+
+            /* xa - xc */
+            Xaminusc = xaIn - xcIn;
+            /* (xb - xd) */
+            Xbminusd = xbIn - xdIn;
+            /* ya - yc */
+            Yaminusc = yaIn - ycIn;
+            /* (yb - yd) */
+            Ybminusd = ybIn - ydIn;
+
+            /* xa + xc */
+            Xaplusc = xaIn + xcIn;
+            /* xb + xd */
+            Xbplusd = xbIn + xdIn;
+            /* ya + yc */
+            Yaplusc = yaIn + ycIn;
+            /* yb + yd */
+            Ybplusd = ybIn + ydIn;
+
+            /* (xa - xc) - (yb - yd) */
+            Xb12C_out = (Xaminusc - Ybminusd);
+            /* (ya - yc) +  (xb - xd) */
+            Yb12C_out = (Yaminusc + Xbminusd);
+            /* xa + xc -(xb + xd) */
+            Xc12C_out = (Xaplusc - Xbplusd);
+            /* (ya + yc) - (yb + yd) */
+            Yc12C_out = (Yaplusc - Ybplusd);
+            /* (xa - xc) + (yb - yd) */
+            Xd12C_out = (Xaminusc + Ybminusd);
+            /* (ya - yc) -  (xb - xd) */
+            Yd12C_out = (Yaminusc - Xbminusd);
+
+            pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+            pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+            Xb12_out = Xb12C_out * co1;
+            Yb12_out = Yb12C_out * co1;
+            Xc12_out = Xc12C_out * co2;
+            Yc12_out = Yc12C_out * co2;
+            Xd12_out = Xd12C_out * co3;
+            Yd12_out = Yd12C_out * co3;
+
+            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+            //Xb12_out -= Yb12C_out * si1;
+            p0 = Yb12C_out * si1;
+            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+            //Yb12_out += Xb12C_out * si1;
+            p1 = Xb12C_out * si1;
+            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+            //Xc12_out -= Yc12C_out * si2;
+            p2 = Yc12C_out * si2;
+            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+            //Yc12_out += Xc12C_out * si2;
+            p3 = Xc12C_out * si2;
+            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+            //Xd12_out -= Yd12C_out * si3;
+            p4 = Yd12C_out * si3;
+            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+            //Yd12_out += Xd12C_out * si3;
+            p5 = Xd12C_out * si3;
+
+            Xb12_out -= p0;
+            Yb12_out += p1;
+            Xc12_out -= p2;
+            Yc12_out += p3;
+            Xd12_out -= p4;
+            Yd12_out += p5;
+
+            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+            pSrc[2U * i1] = Xc12_out;
+
+            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+            pSrc[(2U * i1) + 1U] = Yc12_out;
+
+            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+            pSrc[2U * i2] = Xb12_out;
+
+            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+            pSrc[(2U * i2) + 1U] = Yb12_out;
+
+            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+            pSrc[2U * i3] = Xd12_out;
+
+            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+            pSrc[(2U * i3) + 1U] = Yd12_out;
+
+            i0 += n1;
+         } while (i0 < fftLen);
+         j++;
+      } while (j <= (n2 - 1U));
+      twidCoefModifier <<= 2U;
+   }
+   /*  Initializations of last stage */
+
+   j = fftLen >> 2;
+   ptr1 = &pSrc[0];
+
+   /*  Calculations of last stage */
+   do
+   {
+      xaIn = ptr1[0];
+      yaIn = ptr1[1];
+      xbIn = ptr1[2];
+      ybIn = ptr1[3];
+      xcIn = ptr1[4];
+      ycIn = ptr1[5];
+      xdIn = ptr1[6];
+      ydIn = ptr1[7];
+
+      /*  Butterfly implementation */
+      /* xa + xc */
+      Xaplusc = xaIn + xcIn;
+
+      /* xa - xc */
+      Xaminusc = xaIn - xcIn;
+
+      /* ya + yc */
+      Yaplusc = yaIn + ycIn;
+
+      /* ya - yc */
+      Yaminusc = yaIn - ycIn;
+
+      /* xb + xd */
+      Xbplusd = xbIn + xdIn;
+
+      /* yb + yd */
+      Ybplusd = ybIn + ydIn;
+
+      /* (xb-xd) */
+      Xbminusd = xbIn - xdIn;
+
+      /* (yb-yd) */
+      Ybminusd = ybIn - ydIn;
+
+      /* xa' = (xa+xb+xc+xd) * onebyfftLen */
+      a0 = (Xaplusc + Xbplusd);
+      /* ya' = (ya+yb+yc+yd) * onebyfftLen */
+      a1 = (Yaplusc + Ybplusd);
+      /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+      a2 = (Xaplusc - Xbplusd);
+      /* yc' = (ya-yb+yc-yd) * onebyfftLen  */
+      a3 = (Yaplusc - Ybplusd);
+      /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+      a4 = (Xaminusc - Ybminusd);
+      /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+      a5 = (Yaminusc + Xbminusd);
+      /* xd' = (xa-yb-xc+yd) * onebyfftLen */
+      a6 = (Xaminusc + Ybminusd);
+      /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+      a7 = (Yaminusc - Xbminusd);
+
+      p0 = a0 * onebyfftLen;
+      p1 = a1 * onebyfftLen;
+      p2 = a2 * onebyfftLen;
+      p3 = a3 * onebyfftLen;
+      p4 = a4 * onebyfftLen;
+      p5 = a5 * onebyfftLen;
+      p6 = a6 * onebyfftLen;
+      p7 = a7 * onebyfftLen;
+
+      /* xa' = (xa+xb+xc+xd) * onebyfftLen */
+      ptr1[0] = p0;
+      /* ya' = (ya+yb+yc+yd) * onebyfftLen */
+      ptr1[1] = p1;
+      /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+      ptr1[2] = p2;
+      /* yc' = (ya-yb+yc-yd) * onebyfftLen  */
+      ptr1[3] = p3;
+      /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+      ptr1[4] = p4;
+      /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+      ptr1[5] = p5;
+      /* xd' = (xa+yb-xc-yd) * onebyfftLen */
+      ptr1[6] = p6;
+      /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+      ptr1[7] = p7;
+
+      /* increment source pointer by 8 for next calculations */
+      ptr1 = ptr1 + 8U;
+
+   } while (--j);
+
+#else
+
+        float32_t t1, t2, r1, r2, s1, s2;
+
+   /*  Initializations for the first stage */
+   n2 = fftLen;
+   n1 = n2;
+
+   /*  Calculation of first stage */
+   for (k = fftLen; k > 4U; k >>= 2U)
+   {
+      /*  Initializations for the first stage */
+      n1 = n2;
+      n2 >>= 2U;
+      ia1 = 0U;
+
+      /*  Calculation of first stage */
+      j = 0;
+      do
+      {
+         /*  index calculation for the coefficients */
+         ia2 = ia1 + ia1;
+         ia3 = ia2 + ia1;
+         co1 = pCoef[ia1 * 2U];
+         si1 = pCoef[(ia1 * 2U) + 1U];
+         co2 = pCoef[ia2 * 2U];
+         si2 = pCoef[(ia2 * 2U) + 1U];
+         co3 = pCoef[ia3 * 2U];
+         si3 = pCoef[(ia3 * 2U) + 1U];
+
+         /*  Twiddle coefficients index modifier */
+         ia1 = ia1 + twidCoefModifier;
+
+         i0 = j;
+         do
+         {
+            /*  index calculation for the input as, */
+            /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+            i1 = i0 + n2;
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+
+            /* xa + xc */
+            r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
+
+            /* xa - xc */
+            r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
+
+            /* ya + yc */
+            s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+            /* ya - yc */
+            s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+            /* xb + xd */
+            t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+            /* xa' = xa + xb + xc + xd */
+            pSrc[2U * i0] = r1 + t1;
+
+            /* xa + xc -(xb + xd) */
+            r1 = r1 - t1;
+
+            /* yb + yd */
+            t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+            /* ya' = ya + yb + yc + yd */
+            pSrc[(2U * i0) + 1U] = s1 + t2;
+
+            /* (ya + yc) - (yb + yd) */
+            s1 = s1 - t2;
+
+            /* (yb - yd) */
+            t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+            /* (xb - xd) */
+            t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+            /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+            pSrc[2U * i1] = (r1 * co2) - (s1 * si2);
+
+            /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+            pSrc[(2U * i1) + 1U] = (s1 * co2) + (r1 * si2);
+
+            /* (xa - xc) - (yb - yd) */
+            r1 = r2 - t1;
+
+            /* (xa - xc) + (yb - yd) */
+            r2 = r2 + t1;
+
+            /* (ya - yc) +  (xb - xd) */
+            s1 = s2 + t2;
+
+            /* (ya - yc) -  (xb - xd) */
+            s2 = s2 - t2;
+
+            /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+            pSrc[2U * i2] = (r1 * co1) - (s1 * si1);
+
+            /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+            pSrc[(2U * i2) + 1U] = (s1 * co1) + (r1 * si1);
+
+            /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+            pSrc[2U * i3] = (r2 * co3) - (s2 * si3);
+
+            /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+            pSrc[(2U * i3) + 1U] = (s2 * co3) + (r2 * si3);
+
+            i0 += n1;
+         } while ( i0 < fftLen);
+         j++;
+      } while (j <= (n2 - 1U));
+      twidCoefModifier <<= 2U;
+   }
+   /*  Initializations of last stage */
+   n1 = n2;
+   n2 >>= 2U;
+
+   /*  Calculations of last stage */
+   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
+   {
+      /*  index calculation for the input as, */
+      /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+      i1 = i0 + n2;
+      i2 = i1 + n2;
+      i3 = i2 + n2;
+
+      /*  Butterfly implementation */
+      /* xa + xc */
+      r1 = pSrc[2U * i0] + pSrc[2U * i2];
+
+      /* xa - xc */
+      r2 = pSrc[2U * i0] - pSrc[2U * i2];
+
+      /* ya + yc */
+      s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+      /* ya - yc */
+      s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+      /* xb + xd */
+      t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+      /* xa' = xa + xb + xc + xd */
+      pSrc[2U * i0] = (r1 + t1) * onebyfftLen;
+
+      /* (xa + xc) - (xb + xd) */
+      r1 = r1 - t1;
+
+      /* yb + yd */
+      t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+      /* ya' = ya + yb + yc + yd */
+      pSrc[(2U * i0) + 1U] = (s1 + t2) * onebyfftLen;
+
+      /* (ya + yc) - (yb + yd) */
+      s1 = s1 - t2;
+
+      /* (yb-yd) */
+      t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+      /* (xb-xd) */
+      t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+      /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+      pSrc[2U * i1] = r1 * onebyfftLen;
+
+      /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+      pSrc[(2U * i1) + 1U] = s1 * onebyfftLen;
+
+      /* (xa - xc) - (yb-yd) */
+      r1 = r2 - t1;
+
+      /* (xa - xc) + (yb-yd) */
+      r2 = r2 + t1;
+
+      /* (ya - yc) + (xb-xd) */
+      s1 = s2 + t2;
+
+      /* (ya - yc) - (xb-xd) */
+      s2 = s2 - t2;
+
+      /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+      pSrc[2U * i2] = r1 * onebyfftLen;
+
+      /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+      pSrc[(2U * i2) + 1U] = s1 * onebyfftLen;
+
+      /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+      pSrc[2U * i3] = r2 * onebyfftLen;
+
+      /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+      pSrc[(2U * i3) + 1U] = s2 * onebyfftLen;
+   }
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+}
+
+
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
index b3aabbb..539206d 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
@@ -1,168 +1,156 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix4_init_f32.c
- * Description:  Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Initialization function for the floating-point CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superceded by \ref arm_cfft_f32 and will be removed in the future.
-  @param[in,out] S              points to an instance of the floating-point CFFT/CIFFT structure
-  @param[in]     fftLen         length of the FFT
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
-
-  @par           Details
-                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-  @par
-                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
- */
-
-arm_status arm_cfft_radix4_init_f32(
-  arm_cfft_radix4_instance_f32 * S,
-  uint16_t fftLen,
-  uint8_t ifftFlag,
-  uint8_t bitReverseFlag)
-{
-   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
-
-  /*  Initialise the default arm status */
-  status = ARM_MATH_SUCCESS;
-
-  /*  Initialise the FFT length */
-  S->fftLen = fftLen;
-
-  /*  Initialise the Twiddle coefficient pointer */
-  S->pTwiddle = (float32_t *) twiddleCoef;
-
-  /*  Initialise the Flag for selection of CFFT or CIFFT */
-  S->ifftFlag = ifftFlag;
-
-  /*  Initialise the Flag for calculation Bit reversal or not */
-  S->bitReverseFlag = bitReverseFlag;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
-
-  /*  Initializations of structure parameters depending on the FFT length */
-  switch (S->fftLen)
-  {
-
-  case 4096U:
-    /*  Initializations of structure parameters for 4096 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 1U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 1U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) armBitRevTable;
-    /*  Initialise the 1/fftLen Value */
-    S->onebyfftLen = 0.000244140625;
-    break;
-
-  case 1024U:
-    /*  Initializations of structure parameters for 1024 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 4U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 4U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
-    /*  Initialise the 1/fftLen Value */
-    S->onebyfftLen = 0.0009765625f;
-    break;
-
-
-  case 256U:
-    /*  Initializations of structure parameters for 256 point FFT */
-    S->twidCoefModifier = 16U;
-    S->bitRevFactor = 16U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
-    S->onebyfftLen = 0.00390625f;
-    break;
-
-  case 64U:
-    /*  Initializations of structure parameters for 64 point FFT */
-    S->twidCoefModifier = 64U;
-    S->bitRevFactor = 64U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
-    S->onebyfftLen = 0.015625f;
-    break;
-
-  case 16U:
-    /*  Initializations of structure parameters for 16 point FFT */
-    S->twidCoefModifier = 256U;
-    S->bitRevFactor = 256U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
-    S->onebyfftLen = 0.0625f;
-    break;
-
-
-  default:
-    /*  Reporting argument error if fftSize is not valid value */
-    status = ARM_MATH_ARGUMENT_ERROR;
-    break;
-  }
-#endif
-#endif
-#endif
-  
-  return (status);
-}
-
-/**
-  @} end of ComplexFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix4_init_f32.c
+ * Description:  Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the floating-point CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
+  @param[in,out] S              points to an instance of the floating-point CFFT/CIFFT structure
+  @param[in]     fftLen         length of the FFT
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   The parameter <code>fftLen</code> specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 64, 256, 1024 and 4096.
+  @par
+                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
+
+arm_status arm_cfft_radix4_init_f32(
+  arm_cfft_radix4_instance_f32 * S,
+  uint16_t fftLen,
+  uint8_t ifftFlag,
+  uint8_t bitReverseFlag)
+{
+  /*  Start from success; only the default case of the switch below reports an error */
+  arm_status status = ARM_MATH_SUCCESS;
+
+  /*  Record the requested length (written before the switch validates it) */
+  S->fftLen = fftLen;
+
+  /*  All supported lengths share the single twiddleCoef table */
+  S->pTwiddle = (float32_t *) twiddleCoef;
+
+  /*  Transform direction: 0 selects the forward CFFT, 1 the inverse CIFFT */
+  S->ifftFlag = ifftFlag;
+
+  /*  Store whether bit reversal of the output is enabled (1) or disabled (0) */
+  S->bitReverseFlag = bitReverseFlag;
+
+  /*  Per-length fields: modifier = 4096 / fftLen, table offset = modifier - 1, scale = 1 / fftLen */
+  switch (S->fftLen)
+  {
+
+  case 4096U:
+    /*  Structure parameters for the 4096 point FFT */
+
+    /*  Twiddle coefficient modifier: 4096 / 4096 */
+    S->twidCoefModifier = 1U;
+    /*  Bit reversal table modifier (same value as the twiddle modifier) */
+    S->bitRevFactor = 1U;
+    /*  Bit reversal table pointer: offset bitRevFactor - 1 = 0, i.e. the table start */
+    S->pBitRevTable = (uint16_t *) armBitRevTable;
+    /*  1 / 4096 as a float literal like the other cases (2^-12 is exact in float32) */
+    S->onebyfftLen = 0.000244140625f;
+    break;
+
+  case 1024U:
+    /*  Structure parameters for the 1024 point FFT */
+
+    /*  Twiddle coefficient modifier: 4096 / 1024 */
+    S->twidCoefModifier = 4U;
+    /*  Bit reversal table modifier (same value as the twiddle modifier) */
+    S->bitRevFactor = 4U;
+    /*  Bit reversal table pointer: offset bitRevFactor - 1 = 3 */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+    /*  1 / 1024 */
+    S->onebyfftLen = 0.0009765625f;
+    break;
+
+
+  case 256U:
+    /*  256 point FFT: modifier 4096 / 256 = 16, table offset 15, scale 1 / 256 */
+    S->twidCoefModifier = 16U;
+    S->bitRevFactor = 16U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+    S->onebyfftLen = 0.00390625f;
+    break;
+
+  case 64U:
+    /*  64 point FFT: modifier 4096 / 64 = 64, table offset 63, scale 1 / 64 */
+    S->twidCoefModifier = 64U;
+    S->bitRevFactor = 64U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+    S->onebyfftLen = 0.015625f;
+    break;
+
+  case 16U:
+    /*  16 point FFT: modifier 4096 / 16 = 256, table offset 255, scale 1 / 16 */
+    S->twidCoefModifier = 256U;
+    S->bitRevFactor = 256U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+    S->onebyfftLen = 0.0625f;
+    break;
+
+
+  default:
+    /*  Unsupported fftLen: report an argument error; the per-length fields are not written */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+  return (status);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
index 7774243..1d83009 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
@@ -1,157 +1,145 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix4_init_q15.c
- * Description:  Radix-4 Decimation in Frequency Q15 FFT & IFFT initialization function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-
-/**
-  @brief Initialization function for the Q15 CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
-  @param[in,out] S              points to an instance of the Q15 CFFT/CIFFT structure
-  @param[in]     fftLen         length of the FFT
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
-
-  @par           Details
-                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-  @par
-                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
- */
-
-arm_status arm_cfft_radix4_init_q15(
-  arm_cfft_radix4_instance_q15 * S,
-  uint16_t fftLen,
-  uint8_t ifftFlag,
-  uint8_t bitReverseFlag)
-{
-  /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)
-
-  /*  Initialise the default arm status */
-  status = ARM_MATH_SUCCESS;
-  /*  Initialise the FFT length */
-  S->fftLen = fftLen;
-  /*  Initialise the Twiddle coefficient pointer */
-  S->pTwiddle = (q15_t *) twiddleCoef_4096_q15;
-  /*  Initialise the Flag for selection of CFFT or CIFFT */
-  S->ifftFlag = ifftFlag;
-  /*  Initialise the Flag for calculation Bit reversal or not */
-  S->bitReverseFlag = bitReverseFlag;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
-
-  /*  Initializations of structure parameters depending on the FFT length */
-  switch (S->fftLen)
-  {
-  case 4096U:
-    /*  Initializations of structure parameters for 4096 point FFT */
-
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 1U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 1U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) armBitRevTable;
-
-    break;
-
-  case 1024U:
-    /*  Initializations of structure parameters for 1024 point FFT */
-    S->twidCoefModifier = 4U;
-    S->bitRevFactor = 4U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
-
-    break;
-
-  case 256U:
-    /*  Initializations of structure parameters for 256 point FFT */
-    S->twidCoefModifier = 16U;
-    S->bitRevFactor = 16U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
-
-    break;
-
-  case 64U:
-    /*  Initializations of structure parameters for 64 point FFT */
-    S->twidCoefModifier = 64U;
-    S->bitRevFactor = 64U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
-
-    break;
-
-  case 16U:
-    /*  Initializations of structure parameters for 16 point FFT */
-    S->twidCoefModifier = 256U;
-    S->bitRevFactor = 256U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
-
-    break;
-
-  default:
-    /*  Reporting argument error if fftSize is not valid value */
-    status = ARM_MATH_ARGUMENT_ERROR;
-    break;
-  }
-
-#endif
-#endif 
-#endif
-  return (status);
-}
-
-/**
-  @} end of ComplexFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix4_init_q15.c
+ * Description:  Radix-4 Decimation in Frequency Q15 FFT & IFFT initialization function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+
+/**
+  @brief Initialization function for the Q15 CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+  @param[in,out] S              points to an instance of the Q15 CFFT/CIFFT structure
+  @param[in]     fftLen         length of the FFT
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   The parameter <code>fftLen</code> specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 64, 256, 1024 and 4096.
+  @par
+                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
+
+arm_status arm_cfft_radix4_init_q15(
+  arm_cfft_radix4_instance_q15 * S,
+  uint16_t fftLen,
+  uint8_t ifftFlag,
+  uint8_t bitReverseFlag)
+{
+  /*  Start from success; only the default case of the switch below reports an error */
+  arm_status status = ARM_MATH_SUCCESS;
+  /*  Record the requested length (written before the switch validates it) */
+  S->fftLen = fftLen;
+  /*  All supported lengths share the single twiddleCoef_4096_q15 table */
+  S->pTwiddle = (q15_t *) twiddleCoef_4096_q15;
+  /*  Transform direction: 0 selects the forward CFFT, 1 the inverse CIFFT */
+  S->ifftFlag = ifftFlag;
+  /*  Store whether bit reversal of the output is enabled (1) or disabled (0) */
+  S->bitReverseFlag = bitReverseFlag;
+
+  /*  Per-length fields: modifier = 4096 / fftLen, bit reversal table offset = modifier - 1 */
+  switch (S->fftLen)
+  {
+  case 4096U:
+    /*  Structure parameters for the 4096 point FFT */
+
+    /*  Twiddle coefficient modifier: 4096 / 4096 */
+    S->twidCoefModifier = 1U;
+    /*  Bit reversal table modifier (same value as the twiddle modifier) */
+    S->bitRevFactor = 1U;
+    /*  Bit reversal table pointer: offset bitRevFactor - 1 = 0, i.e. the table start */
+    S->pBitRevTable = (uint16_t *) armBitRevTable;
+
+    break;
+
+  case 1024U:
+    /*  1024 point FFT: modifier 4096 / 1024 = 4, table offset 3 */
+    S->twidCoefModifier = 4U;
+    S->bitRevFactor = 4U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+
+    break;
+
+  case 256U:
+    /*  256 point FFT: modifier 4096 / 256 = 16, table offset 15 */
+    S->twidCoefModifier = 16U;
+    S->bitRevFactor = 16U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+
+    break;
+
+  case 64U:
+    /*  64 point FFT: modifier 4096 / 64 = 64, table offset 63 */
+    S->twidCoefModifier = 64U;
+    S->bitRevFactor = 64U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+
+    break;
+
+  case 16U:
+    /*  16 point FFT: modifier 4096 / 16 = 256, table offset 255 */
+    S->twidCoefModifier = 256U;
+    S->bitRevFactor = 256U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+
+    break;
+
+  default:
+    /*  Unsupported fftLen: report an argument error; the per-length fields are not written */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+  return (status);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
index 04ba393..964b4c8 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
@@ -1,154 +1,141 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix4_init_q31.c
- * Description:  Radix-4 Decimation in Frequency Q31 FFT & IFFT initialization function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-
-  @brief         Initialization function for the Q31 CFFT/CIFFT.
-  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
-  @param[in,out] S              points to an instance of the Q31 CFFT/CIFFT structure.
-  @param[in]     fftLen         length of the FFT.
-  @param[in]     ifftFlag       flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
-
-  @par           Details
-                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlag for calculation of CIFFT otherwise  CFFT is calculated
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   The parameter <code>fftLen</code> Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
-  @par
-                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
-*/
-
-arm_status arm_cfft_radix4_init_q31(
-  arm_cfft_radix4_instance_q31 * S,
-  uint16_t fftLen,
-  uint8_t ifftFlag,
-  uint8_t bitReverseFlag)
-{
-
-  /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)
-
-  /*  Initialise the default arm status */
-  status = ARM_MATH_SUCCESS;
-  /*  Initialise the FFT length */
-  S->fftLen = fftLen;
-  /*  Initialise the Twiddle coefficient pointer */
-  S->pTwiddle = (q31_t *) twiddleCoef_4096_q31;
-  /*  Initialise the Flag for selection of CFFT or CIFFT */
-  S->ifftFlag = ifftFlag;
-  /*  Initialise the Flag for calculation Bit reversal or not */
-  S->bitReverseFlag = bitReverseFlag;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
-
-  /*  Initializations of Instance structure depending on the FFT length */
-  switch (S->fftLen)
-  {
-    /*  Initializations of structure parameters for 4096 point FFT */
-  case 4096U:
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 1U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 1U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) armBitRevTable;
-    break;
-
-    /*  Initializations of structure parameters for 1024 point FFT */
-  case 1024U:
-    /*  Initialise the twiddle coef modifier value */
-    S->twidCoefModifier = 4U;
-    /*  Initialise the bit reversal table modifier */
-    S->bitRevFactor = 4U;
-    /*  Initialise the bit reversal table pointer */
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
-    break;
-
-  case 256U:
-    /*  Initializations of structure parameters for 256 point FFT */
-    S->twidCoefModifier = 16U;
-    S->bitRevFactor = 16U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
-    break;
-
-  case 64U:
-    /*  Initializations of structure parameters for 64 point FFT */
-    S->twidCoefModifier = 64U;
-    S->bitRevFactor = 64U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
-    break;
-
-  case 16U:
-    /*  Initializations of structure parameters for 16 point FFT */
-    S->twidCoefModifier = 256U;
-    S->bitRevFactor = 256U;
-    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
-    break;
-
-  default:
-    /*  Reporting argument error if fftSize is not valid value */
-    status = ARM_MATH_ARGUMENT_ERROR;
-    break;
-  }
-
-#endif
-#endif
-#endif
-  return (status);
-}
-
-/**
-  @} end of ComplexFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix4_init_q31.c
+ * Description:  Radix-4 Decimation in Frequency Q31 FFT & IFFT initialization function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+
+  @brief         Initialization function for the Q31 CFFT/CIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+  @param[in,out] S              points to an instance of the Q31 CFFT/CIFFT structure.
+  @param[in]     fftLen         length of the FFT.
+  @param[in]     ifftFlag       flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>ifftFlag</code> controls whether a forward or inverse transform is computed.
+                   Set <code>ifftFlag</code> to 1 to compute the CIFFT; otherwise the CFFT is computed.
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   The parameter <code>fftLen</code> specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 64, 256, 1024 and 4096.
+  @par
+                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix4_init_q31(
+  arm_cfft_radix4_instance_q31 * S,
+  uint16_t fftLen,
+  uint8_t ifftFlag,
+  uint8_t bitReverseFlag)
+{
+  /*  Assume success; only the default case of the switch below overrides this */
+  arm_status status = ARM_MATH_SUCCESS;
+  /*  Record the requested length (stored even if it turns out to be unsupported) */
+  S->fftLen = fftLen;
+  /*  Every length points at the same 4096-point Q31 twiddle table */
+  S->pTwiddle = (q31_t *) twiddleCoef_4096_q31;
+  /*  Store the transform-direction flag exactly as passed in */
+  S->ifftFlag = ifftFlag;
+  /*  Store the bit-reversal enable flag exactly as passed in */
+  S->bitReverseFlag = bitReverseFlag;
+
+  /*  Per-length setup: twidCoefModifier and bitRevFactor are both 4096 / fftLen */
+  switch (S->fftLen)
+  {
+    /*  4096 point FFT: modifiers are 1 and the bit reversal table is used from its start */
+  case 4096U:
+    /*  Twiddle coefficient modifier: 4096 / 4096 */
+    S->twidCoefModifier = 1U;
+    /*  Bit reversal table modifier: 4096 / 4096 */
+    S->bitRevFactor = 1U;
+    /*  Bit reversal table pointer: element 0 of armBitRevTable */
+    S->pBitRevTable = (uint16_t *) armBitRevTable;
+    break;
+
+    /*  1024 point FFT: modifiers are 4, table pointer starts at element 3 */
+  case 1024U:
+    /*  Twiddle coefficient modifier: 4096 / 1024 */
+    S->twidCoefModifier = 4U;
+    /*  Bit reversal table modifier: 4096 / 1024 */
+    S->bitRevFactor = 4U;
+    /*  Bit reversal table pointer: element (bitRevFactor - 1) of armBitRevTable */
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+    break;
+
+  case 256U:
+    /*  256 point FFT: modifiers are 4096 / 256, table pointer at element (bitRevFactor - 1) */
+    S->twidCoefModifier = 16U;
+    S->bitRevFactor = 16U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+    break;
+
+  case 64U:
+    /*  64 point FFT: modifiers are 4096 / 64, table pointer at element (bitRevFactor - 1) */
+    S->twidCoefModifier = 64U;
+    S->bitRevFactor = 64U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+    break;
+
+  case 16U:
+    /*  16 point FFT: modifiers are 4096 / 16, table pointer at element (bitRevFactor - 1) */
+    S->twidCoefModifier = 256U;
+    S->bitRevFactor = 256U;
+    S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+    break;
+
+  default:
+    /*  Unsupported fftLen: report an argument error; the three per-length fields are left unset */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+  return (status);
+}
+
+/**
+  @} end of ComplexFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
index 280acca..825a16b 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
@@ -1,1809 +1,1809 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix4_q15.c
- * Description:  This file has function definition of Radix-4 FFT & IFFT function and
- *               In-place bit reversal using bit reversal table
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-
-void arm_radix4_butterfly_q15(
-        q15_t * pSrc16,
-        uint32_t fftLen,
-  const q15_t * pCoef16,
-        uint32_t twidCoefModifier);
-
-void arm_radix4_butterfly_inverse_q15(
-        q15_t * pSrc16,
-        uint32_t fftLen,
-  const q15_t * pCoef16,
-        uint32_t twidCoefModifier);
-
-void arm_bitreversal_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab);
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-
-/**
-  @brief               Processing function for the Q15 CFFT/CIFFT.
-  @deprecated          Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
-  @param[in]     S     points to an instance of the Q15 CFFT/CIFFT structure.
-  @param[in,out] pSrc  points to the complex data buffer. Processing occurs in-place.
-  @return        none
- 
-  @par Input and output formats:
-                 Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
-                 Hence the output format is different for different FFT sizes.
-                 The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
-  @par
-                 \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
-                 \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
- */
-
-void arm_cfft_radix4_q15(
-  const arm_cfft_radix4_instance_q15 * S,
-        q15_t * pSrc)
-{
-  if (S->ifftFlag == 1U)
-  {
-    /*  Complex IFFT radix-4  */
-    arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
-  }
-  else
-  {
-    /*  Complex FFT radix-4  */
-    arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
-  }
-
-  if (S->bitReverseFlag == 1U)
-  {
-    /*  Bit Reversal */
-    arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
-  }
-
-}
-
-/**
-  @} end of ComplexFFT group
- */
-
-/*
- * Radix-4 FFT algorithm used is :
- *
- * Input real and imaginary data:
- * x(n) = xa + j * ya
- * x(n+N/4 ) = xb + j * yb
- * x(n+N/2 ) = xc + j * yc
- * x(n+3N 4) = xd + j * yd
- *
- *
- * Output real and imaginary data:
- * x(4r) = xa'+ j * ya'
- * x(4r+1) = xb'+ j * yb'
- * x(4r+2) = xc'+ j * yc'
- * x(4r+3) = xd'+ j * yd'
- *
- *
- * Twiddle factors for radix-4 FFT:
- * Wn = co1 + j * (- si1)
- * W2n = co2 + j * (- si2)
- * W3n = co3 + j * (- si3)
- 
- * The real and imaginary output values for the radix-4 butterfly are
- * xa' = xa + xb + xc + xd
- * ya' = ya + yb + yc + yd
- * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
- * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
- * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
- * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
- * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
- * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
- *
- */
-
-/**
-  @brief         Core function for the Q15 CFFT butterfly process.
-  @param[in,out] pSrc16          points to the in-place buffer of Q15 data type
-  @param[in]     fftLen           length of the FFT
-  @param[in]     pCoef16         points to twiddle coefficient buffer
-  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  @return        none
- */
-
-void arm_radix4_butterfly_q15(
-        q15_t * pSrc16,
-        uint32_t fftLen,
-  const q15_t * pCoef16,
-        uint32_t twidCoefModifier)
-{
-
-#if defined (ARM_MATH_DSP)
-
-        q31_t R, S, T, U;
-        q31_t C1, C2, C3, out1, out2;
-        uint32_t n1, n2, ic, i0, j, k;
-
-        q15_t *ptr1;
-        q15_t *pSi0;
-        q15_t *pSi1;
-        q15_t *pSi2;
-        q15_t *pSi3;
-
-        q31_t xaya, xbyb, xcyc, xdyd;
-
-  /* Total process is divided into three stages */
-
-  /* process first stage, middle stages, & last stage */
-
-  /*  Initializations for the first stage */
-  n2 = fftLen;
-  n1 = n2;
-
-  /* n2 = fftLen/4 */
-  n2 >>= 2U;
-
-  /* Index for twiddle coefficient */
-  ic = 0U;
-
-  /* Index for input read and output write */
-  j = n2;
-
-  pSi0 = pSrc16;
-  pSi1 = pSi0 + 2 * n2;
-  pSi2 = pSi1 + 2 * n2;
-  pSi3 = pSi2 + 2 * n2;
-
-  /* Input is in 1.15(q15) format */
-
-  /*  start of first stage process */
-  do
-  {
-    /*  Butterfly implementation */
-
-    /* Reading i0, i0+fftLen/2 inputs */
-    /* Read ya (real), xa(imag) input */
-    T = read_q15x2 (pSi0);
-    T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
-    T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
-/*
-    in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
-     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
-*/
-
-    /* Read yc (real), xc(imag) input */
-    S = read_q15x2 (pSi2);
-    S = __SHADD16(S, 0);
-    S = __SHADD16(S, 0);
-
-    /* R = packed((ya + yc), (xa + xc) ) */
-    R = __QADD16(T, S);
-
-    /* S = packed((ya - yc), (xa - xc) ) */
-    S = __QSUB16(T, S);
-
-    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-    /* Read yb (real), xb(imag) input */
-    T = read_q15x2 (pSi1);
-    T = __SHADD16(T, 0);
-    T = __SHADD16(T, 0);
-
-    /* Read yd (real), xd(imag) input */
-    U = read_q15x2 (pSi3);
-    U = __SHADD16(U, 0);
-    U = __SHADD16(U, 0);
-
-    /* T = packed((yb + yd), (xb + xd) ) */
-    T = __QADD16(T, U);
-
-    /*  writing the butterfly processed i0 sample */
-    /* xa' = xa + xb + xc + xd */
-    /* ya' = ya + yb + yc + yd */
-    write_q15x2_ia (&pSi0, __SHADD16(R, T));
-
-    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
-    R = __QSUB16(R, T);
-
-    /* co2 & si2 are read from SIMD Coefficient pointer */
-    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-    out1 = __SMUAD(C2, R) >> 16U;
-    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-    out2 = __SMUSDX(C2, R);
-#else
-    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-    out1 = __SMUSDX(R, C2) >> 16U;
-    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-    out2 = __SMUAD(C2, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    /*  Reading i0+fftLen/4 */
-    /* T = packed(yb, xb) */
-    T = read_q15x2 (pSi1);
-    T = __SHADD16(T, 0);
-    T = __SHADD16(T, 0);
-
-    /* writing the butterfly processed i0 + fftLen/4 sample */
-    /* writing output(xc', yc') in little endian format */
-    write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
-
-    /*  Butterfly calculations */
-    /* U = packed(yd, xd) */
-    U = read_q15x2 (pSi3);
-    U = __SHADD16(U, 0);
-    U = __SHADD16(U, 0);
-
-    /* T = packed(yb-yd, xb-xd) */
-    T = __QSUB16(T, U);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
-    R = __QASX(S, T);
-    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
-    S = __QSAX(S, T);
-#else
-    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
-    R = __QSAX(S, T);
-    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
-    S = __QASX(S, T);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    /* co1 & si1 are read from SIMD Coefficient pointer */
-    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
-    /*  Butterfly process for the i0+fftLen/2 sample */
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-    out1 = __SMUAD(C1, S) >> 16U;
-    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-    out2 = __SMUSDX(C1, S);
-#else
-    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-    out1 = __SMUSDX(S, C1) >> 16U;
-    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-    out2 = __SMUAD(C1, S);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    /* writing output(xb', yb') in little endian format */
-    write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
-
-    /* co3 & si3 are read from SIMD Coefficient pointer */
-    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
-    /*  Butterfly process for the i0+3fftLen/4 sample */
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
-    out1 = __SMUAD(C3, R) >> 16U;
-    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-    out2 = __SMUSDX(C3, R);
-#else
-    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-    out1 = __SMUSDX(R, C3) >> 16U;
-    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
-    out2 = __SMUAD(C3, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    /* writing output(xd', yd') in little endian format */
-    write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
-
-    /*  Twiddle coefficients index modifier */
-    ic = ic + twidCoefModifier;
-
-  } while (--j);
-  /* data is in 4.11(q11) format */
-
-  /* end of first stage process */
-
-
-  /* start of middle stage process */
-
-  /*  Twiddle coefficients index modifier */
-  twidCoefModifier <<= 2U;
-
-  /*  Calculation of Middle stage */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U)
-  {
-    /*  Initializations for the middle stage */
-    n1 = n2;
-    n2 >>= 2U;
-    ic = 0U;
-
-    for (j = 0U; j <= (n2 - 1U); j++)
-    {
-      /*  index calculation for the coefficients */
-      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
-      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
-      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
-
-      /*  Twiddle coefficients index modifier */
-      ic = ic + twidCoefModifier;
-
-      pSi0 = pSrc16 + 2 * j;
-      pSi1 = pSi0 + 2 * n2;
-      pSi2 = pSi1 + 2 * n2;
-      pSi3 = pSi2 + 2 * n2;
-
-      /*  Butterfly implementation */
-      for (i0 = j; i0 < fftLen; i0 += n1)
-      {
-        /*  Reading i0, i0+fftLen/2 inputs */
-        /* Read ya (real), xa(imag) input */
-        T = read_q15x2 (pSi0);
-
-        /* Read yc (real), xc(imag) input */
-        S = read_q15x2 (pSi2);
-
-        /* R = packed( (ya + yc), (xa + xc)) */
-        R = __QADD16(T, S);
-
-        /* S = packed((ya - yc), (xa - xc)) */
-        S = __QSUB16(T, S);
-
-        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-        /* Read yb (real), xb(imag) input */
-        T = read_q15x2 (pSi1);
-
-        /* Read yd (real), xd(imag) input */
-        U = read_q15x2 (pSi3);
-
-        /* T = packed( (yb + yd), (xb + xd)) */
-        T = __QADD16(T, U);
-
-        /*  writing the butterfly processed i0 sample */
-
-        /* xa' = xa + xb + xc + xd */
-        /* ya' = ya + yb + yc + yd */
-        out1 = __SHADD16(R, T);
-        out1 = __SHADD16(out1, 0);
-        write_q15x2 (pSi0, out1);
-        pSi0 += 2 * n1;
-
-        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
-        R = __SHSUB16(R, T);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
-        out1 = __SMUAD(C2, R) >> 16U;
-
-        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        out2 = __SMUSDX(C2, R);
-#else
-        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        out1 = __SMUSDX(R, C2) >> 16U;
-
-        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
-        out2 = __SMUAD(C2, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /*  Reading i0+3fftLen/4 */
-        /* Read yb (real), xb(imag) input */
-        T = read_q15x2 (pSi1);
-
-        /*  writing the butterfly processed i0 + fftLen/4 sample */
-        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
-        pSi1 += 2 * n1;
-
-        /*  Butterfly calculations */
-
-        /* Read yd (real), xd(imag) input */
-        U = read_q15x2 (pSi3);
-
-        /* T = packed(yb-yd, xb-xd) */
-        T = __QSUB16(T, U);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
-        R = __SHASX(S, T);
-
-        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
-        S = __SHSAX(S, T);
-
-
-        /*  Butterfly process for the i0+fftLen/2 sample */
-        out1 = __SMUAD(C1, S) >> 16U;
-        out2 = __SMUSDX(C1, S);
-#else
-        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
-        R = __SHSAX(S, T);
-
-        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
-        S = __SHASX(S, T);
-
-
-        /*  Butterfly process for the i0+fftLen/2 sample */
-        out1 = __SMUSDX(S, C1) >> 16U;
-        out2 = __SMUAD(C1, S);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-        write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
-        pSi2 += 2 * n1;
-
-        /*  Butterfly process for the i0+3fftLen/4 sample */
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        out1 = __SMUAD(C3, R) >> 16U;
-        out2 = __SMUSDX(C3, R);
-#else
-        out1 = __SMUSDX(R, C3) >> 16U;
-        out2 = __SMUAD(C3, R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
-        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-        write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
-        pSi3 += 2 * n1;
-      }
-    }
-    /*  Twiddle coefficients index modifier */
-    twidCoefModifier <<= 2U;
-  }
-  /* end of middle stage process */
-
-
-  /* data is in 10.6(q6) format for the 1024 point */
-  /* data is in 8.8(q8) format for the 256 point */
-  /* data is in 6.10(q10) format for the 64 point */
-  /* data is in 4.12(q12) format for the 16 point */
-
-  /*  Initializations for the last stage */
-  j = fftLen >> 2;
-
-  ptr1 = &pSrc16[0];
-
-  /* start of last stage process */
-
-  /*  Butterfly implementation */
-  do
-  {
-    /* Read xa (real), ya(imag) input */
-    xaya = read_q15x2_ia (&ptr1);
-
-    /* Read xb (real), yb(imag) input */
-    xbyb = read_q15x2_ia (&ptr1);
-
-    /* Read xc (real), yc(imag) input */
-    xcyc = read_q15x2_ia (&ptr1);
-
-    /* Read xd (real), yd(imag) input */
-    xdyd = read_q15x2_ia (&ptr1);
-
-    /* R = packed((ya + yc), (xa + xc)) */
-    R = __QADD16(xaya, xcyc);
-
-    /* T = packed((yb + yd), (xb + xd)) */
-    T = __QADD16(xbyb, xdyd);
-
-    /* pointer updation for writing */
-    ptr1 = ptr1 - 8U;
-
-
-    /* xa' = xa + xb + xc + xd */
-    /* ya' = ya + yb + yc + yd */
-    write_q15x2_ia (&ptr1, __SHADD16(R, T));
-
-    /* T = packed((yb + yd), (xb + xd)) */
-    T = __QADD16(xbyb, xdyd);
-
-    /* xc' = (xa-xb+xc-xd) */
-    /* yc' = (ya-yb+yc-yd) */
-    write_q15x2_ia (&ptr1, __SHSUB16(R, T));
-
-    /* S = packed((ya - yc), (xa - xc)) */
-    S = __QSUB16(xaya, xcyc);
-
-    /* Read yd (real), xd(imag) input */
-    /* T = packed( (yb - yd), (xb - xd))  */
-    U = __QSUB16(xbyb, xdyd);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* xb' = (xa+yb-xc-yd) */
-    /* yb' = (ya-xb-yc+xd) */
-    write_q15x2_ia (&ptr1, __SHSAX(S, U));
-
-    /* xd' = (xa-yb-xc+yd) */
-    /* yd' = (ya+xb-yc-xd) */
-    write_q15x2_ia (&ptr1, __SHASX(S, U));
-#else
-    /* xb' = (xa+yb-xc-yd) */
-    /* yb' = (ya-xb-yc+xd) */
-    write_q15x2_ia (&ptr1, __SHASX(S, U));
-
-    /* xd' = (xa-yb-xc+yd) */
-    /* yd' = (ya+xb-yc-xd) */
-    write_q15x2_ia (&ptr1, __SHSAX(S, U));
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-  } while (--j);
-
-  /* end of last stage process */
-
-  /* output is in 11.5(q5) format for the 1024 point */
-  /* output is in 9.7(q7) format for the 256 point   */
-  /* output is in 7.9(q9) format for the 64 point  */
-  /* output is in 5.11(q11) format for the 16 point  */
-
-
-#else /* #if defined (ARM_MATH_DSP) */
-
-        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
-        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
-        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
-
-  /* Total process is divided into three stages */
-
-  /* process first stage, middle stages, & last stage */
-
-  /*  Initializations for the first stage */
-  n2 = fftLen;
-  n1 = n2;
-
-  /* n2 = fftLen/4 */
-  n2 >>= 2U;
-
-  /* Index for twiddle coefficient */
-  ic = 0U;
-
-  /* Index for input read and output write */
-  i0 = 0U;
-  j = n2;
-
-  /* Input is in 1.15(q15) format */
-
-  /*  start of first stage process */
-  do
-  {
-    /*  Butterfly implementation */
-
-    /*  index calculation for the input as, */
-    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-
-    /*  Reading i0, i0+fftLen/2 inputs */
-
-    /* input is down scale by 4 to avoid overflow */
-    /* Read ya (real), xa(imag) input */
-    T0 = pSrc16[i0 * 2U] >> 2U;
-    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
-
-    /* input is down scale by 4 to avoid overflow */
-    /* Read yc (real), xc(imag) input */
-    S0 = pSrc16[i2 * 2U] >> 2U;
-    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
-
-    /* R0 = (ya + yc) */
-    R0 = __SSAT(T0 + S0, 16U);
-    /* R1 = (xa + xc) */
-    R1 = __SSAT(T1 + S1, 16U);
-
-    /* S0 = (ya - yc) */
-    S0 = __SSAT(T0 - S0, 16);
-    /* S1 = (xa - xc) */
-    S1 = __SSAT(T1 - S1, 16);
-
-    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-    /* input is down scale by 4 to avoid overflow */
-    /* Read yb (real), xb(imag) input */
-    T0 = pSrc16[i1 * 2U] >> 2U;
-    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
-
-    /* input is down scale by 4 to avoid overflow */
-    /* Read yd (real), xd(imag) input */
-    U0 = pSrc16[i3 * 2U] >> 2U;
-    U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
-
-    /* T0 = (yb + yd) */
-    T0 = __SSAT(T0 + U0, 16U);
-    /* T1 = (xb + xd) */
-    T1 = __SSAT(T1 + U1, 16U);
-
-    /*  writing the butterfly processed i0 sample */
-    /* ya' = ya + yb + yc + yd */
-    /* xa' = xa + xb + xc + xd */
-    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
-    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
-
-    /* R0 = (ya + yc) - (yb + yd) */
-    /* R1 = (xa + xc) - (xb + xd) */
-    R0 = __SSAT(R0 - T0, 16U);
-    R1 = __SSAT(R1 - T1, 16U);
-
-    /* co2 & si2 are read from Coefficient pointer */
-    Co2 = pCoef16[2U * ic * 2U];
-    Si2 = pCoef16[(2U * ic * 2U) + 1];
-
-    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-    out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
-    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-    out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
-
-    /*  Reading i0+fftLen/4 */
-    /* input is down scale by 4 to avoid overflow */
-    /* T0 = yb, T1 =  xb */
-    T0 = pSrc16[i1 * 2U] >> 2;
-    T1 = pSrc16[(i1 * 2U) + 1] >> 2;
-
-    /* writing the butterfly processed i0 + fftLen/4 sample */
-    /* writing output(xc', yc') in little endian format */
-    pSrc16[i1 * 2U] = out1;
-    pSrc16[(i1 * 2U) + 1] = out2;
-
-    /*  Butterfly calculations */
-    /* input is down scale by 4 to avoid overflow */
-    /* U0 = yd, U1 = xd */
-    U0 = pSrc16[i3 * 2U] >> 2;
-    U1 = pSrc16[(i3 * 2U) + 1] >> 2;
-    /* T0 = yb-yd */
-    T0 = __SSAT(T0 - U0, 16);
-    /* T1 = xb-xd */
-    T1 = __SSAT(T1 - U1, 16);
-
-    /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
-    R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
-    R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
-
-    /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
-    S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
-    S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
-
-    /* co1 & si1 are read from Coefficient pointer */
-    Co1 = pCoef16[ic * 2U];
-    Si1 = pCoef16[(ic * 2U) + 1];
-    /*  Butterfly process for the i0+fftLen/2 sample */
-    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-    out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
-    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-    out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
-
-    /* writing output(xb', yb') in little endian format */
-    pSrc16[i2 * 2U] = out1;
-    pSrc16[(i2 * 2U) + 1] = out2;
-
-    /* Co3 & si3 are read from Coefficient pointer */
-    Co3 = pCoef16[3U * (ic * 2U)];
-    Si3 = pCoef16[(3U * (ic * 2U)) + 1];
-    /*  Butterfly process for the i0+3fftLen/4 sample */
-    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
-    out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
-    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
-    out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
-    /* writing output(xd', yd') in little endian format */
-    pSrc16[i3 * 2U] = out1;
-    pSrc16[(i3 * 2U) + 1] = out2;
-
-    /*  Twiddle coefficients index modifier */
-    ic = ic + twidCoefModifier;
-
-    /*  Updating input index */
-    i0 = i0 + 1U;
-
-  } while (--j);
-  /* data is in 4.11(q11) format */
-
-  /* end of first stage process */
-
-
-  /* start of middle stage process */
-
-  /*  Twiddle coefficients index modifier */
-  twidCoefModifier <<= 2U;
-
-  /*  Calculation of Middle stage */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U)
-  {
-    /*  Initializations for the middle stage */
-    n1 = n2;
-    n2 >>= 2U;
-    ic = 0U;
-
-    for (j = 0U; j <= (n2 - 1U); j++)
-    {
-      /*  index calculation for the coefficients */
-      Co1 = pCoef16[ic * 2U];
-      Si1 = pCoef16[(ic * 2U) + 1U];
-      Co2 = pCoef16[2U * (ic * 2U)];
-      Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
-      Co3 = pCoef16[3U * (ic * 2U)];
-      Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
-
-      /*  Twiddle coefficients index modifier */
-      ic = ic + twidCoefModifier;
-
-      /*  Butterfly implementation */
-      for (i0 = j; i0 < fftLen; i0 += n1)
-      {
-        /*  index calculation for the input as, */
-        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-        i1 = i0 + n2;
-        i2 = i1 + n2;
-        i3 = i2 + n2;
-
-        /*  Reading i0, i0+fftLen/2 inputs */
-        /* Read ya (real), xa(imag) input */
-        T0 = pSrc16[i0 * 2U];
-        T1 = pSrc16[(i0 * 2U) + 1U];
-
-        /* Read yc (real), xc(imag) input */
-        S0 = pSrc16[i2 * 2U];
-        S1 = pSrc16[(i2 * 2U) + 1U];
-
-        /* R0 = (ya + yc), R1 = (xa + xc) */
-        R0 = __SSAT(T0 + S0, 16);
-        R1 = __SSAT(T1 + S1, 16);
-
-        /* S0 = (ya - yc), S1 =(xa - xc) */
-        S0 = __SSAT(T0 - S0, 16);
-        S1 = __SSAT(T1 - S1, 16);
-
-        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-        /* Read yb (real), xb(imag) input */
-        T0 = pSrc16[i1 * 2U];
-        T1 = pSrc16[(i1 * 2U) + 1U];
-
-        /* Read yd (real), xd(imag) input */
-        U0 = pSrc16[i3 * 2U];
-        U1 = pSrc16[(i3 * 2U) + 1U];
-
-
-        /* T0 = (yb + yd), T1 = (xb + xd) */
-        T0 = __SSAT(T0 + U0, 16);
-        T1 = __SSAT(T1 + U1, 16);
-
-        /*  writing the butterfly processed i0 sample */
-
-        /* xa' = xa + xb + xc + xd */
-        /* ya' = ya + yb + yc + yd */
-        out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
-        out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
-
-        pSrc16[i0 * 2U] = out1;
-        pSrc16[(2U * i0) + 1U] = out2;
-
-        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
-        R0 = (R0 >> 1U) - (T0 >> 1U);
-        R1 = (R1 >> 1U) - (T1 >> 1U);
-
-        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
-        out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
-
-        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
-
-        /*  Reading i0+3fftLen/4 */
-        /* Read yb (real), xb(imag) input */
-        T0 = pSrc16[i1 * 2U];
-        T1 = pSrc16[(i1 * 2U) + 1U];
-
-        /*  writing the butterfly processed i0 + fftLen/4 sample */
-        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        pSrc16[i1 * 2U] = out1;
-        pSrc16[(i1 * 2U) + 1U] = out2;
-
-        /*  Butterfly calculations */
-
-        /* Read yd (real), xd(imag) input */
-        U0 = pSrc16[i3 * 2U];
-        U1 = pSrc16[(i3 * 2U) + 1U];
-
-        /* T0 = yb-yd, T1 = xb-xd */
-        T0 = __SSAT(T0 - U0, 16);
-        T1 = __SSAT(T1 - U1, 16);
-
-        /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
-        R0 = (S0 >> 1U) - (T1 >> 1U);
-        R1 = (S1 >> 1U) + (T0 >> 1U);
-
-        /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
-        S0 = (S0 >> 1U) + (T1 >> 1U);
-        S1 = (S1 >> 1U) - (T0 >> 1U);
-
-        /*  Butterfly process for the i0+fftLen/2 sample */
-        out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
-
-        out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
-
-        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-        pSrc16[i2 * 2U] = out1;
-        pSrc16[(i2 * 2U) + 1U] = out2;
-
-        /*  Butterfly process for the i0+3fftLen/4 sample */
-        out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
-
-        out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
-        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
-        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
-        pSrc16[i3 * 2U] = out1;
-        pSrc16[(i3 * 2U) + 1U] = out2;
-      }
-    }
-    /*  Twiddle coefficients index modifier */
-    twidCoefModifier <<= 2U;
-  }
-  /* end of middle stage process */
-
-
-  /* data is in 10.6(q6) format for the 1024 point */
-  /* data is in 8.8(q8) format for the 256 point */
-  /* data is in 6.10(q10) format for the 64 point */
-  /* data is in 4.12(q12) format for the 16 point */
-
-  /*  Initializations for the last stage */
-  n1 = n2;
-  n2 >>= 2U;
-
-  /* start of last stage process */
-
-  /*  Butterfly implementation */
-  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
-  {
-    /*  index calculation for the input as, */
-    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-
-    /*  Reading i0, i0+fftLen/2 inputs */
-    /* Read ya (real), xa(imag) input */
-    T0 = pSrc16[i0 * 2U];
-    T1 = pSrc16[(i0 * 2U) + 1U];
-
-    /* Read yc (real), xc(imag) input */
-    S0 = pSrc16[i2 * 2U];
-    S1 = pSrc16[(i2 * 2U) + 1U];
-
-    /* R0 = (ya + yc), R1 = (xa + xc) */
-    R0 = __SSAT(T0 + S0, 16U);
-    R1 = __SSAT(T1 + S1, 16U);
-
-    /* S0 = (ya - yc), S1 = (xa - xc) */
-    S0 = __SSAT(T0 - S0, 16U);
-    S1 = __SSAT(T1 - S1, 16U);
-
-    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-    /* Read yb (real), xb(imag) input */
-    T0 = pSrc16[i1 * 2U];
-    T1 = pSrc16[(i1 * 2U) + 1U];
-    /* Read yd (real), xd(imag) input */
-    U0 = pSrc16[i3 * 2U];
-    U1 = pSrc16[(i3 * 2U) + 1U];
-
-    /* T0 = (yb + yd), T1 = (xb + xd)) */
-    T0 = __SSAT(T0 + U0, 16U);
-    T1 = __SSAT(T1 + U1, 16U);
-
-    /*  writing the butterfly processed i0 sample */
-    /* xa' = xa + xb + xc + xd */
-    /* ya' = ya + yb + yc + yd */
-    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
-    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
-
-    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
-    R0 = (R0 >> 1U) - (T0 >> 1U);
-    R1 = (R1 >> 1U) - (T1 >> 1U);
-    /* Read yb (real), xb(imag) input */
-    T0 = pSrc16[i1 * 2U];
-    T1 = pSrc16[(i1 * 2U) + 1U];
-
-    /*  writing the butterfly processed i0 + fftLen/4 sample */
-    /* xc' = (xa-xb+xc-xd) */
-    /* yc' = (ya-yb+yc-yd) */
-    pSrc16[i1 * 2U] = R0;
-    pSrc16[(i1 * 2U) + 1U] = R1;
-
-    /* Read yd (real), xd(imag) input */
-    U0 = pSrc16[i3 * 2U];
-    U1 = pSrc16[(i3 * 2U) + 1U];
-    /* T0 = (yb - yd), T1 = (xb - xd)  */
-    T0 = __SSAT(T0 - U0, 16U);
-    T1 = __SSAT(T1 - U1, 16U);
-
-    /*  writing the butterfly processed i0 + fftLen/2 sample */
-    /* xb' = (xa+yb-xc-yd) */
-    /* yb' = (ya-xb-yc+xd) */
-    pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
-    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
-
-    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
-    /* xd' = (xa-yb-xc+yd) */
-    /* yd' = (ya+xb-yc-xd) */
-    pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
-    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
-
-  }
-
-  /* end of last stage process */
-
-  /* output is in 11.5(q5) format for the 1024 point */
-  /* output is in 9.7(q7) format for the 256 point   */
-  /* output is in 7.9(q9) format for the 64 point  */
-  /* output is in 5.11(q11) format for the 16 point  */
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-}
-
-
-/**
-  @brief         Core function for the Q15 CIFFT butterfly process.
-  @param[in,out] pSrc16           points to the in-place buffer of Q15 data type
-  @param[in]     fftLen           length of the FFT
-  @param[in]     pCoef16          points to twiddle coefficient buffer
-  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-  @return        none
- */
-
-/*
- * Radix-4 IFFT algorithm used is :
- *
- * CIFFT uses same twiddle coefficients as CFFT function
- *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
- *
- *
- * IFFT is implemented with following changes in equations from FFT
- *
- * Input real and imaginary data:
- * x(n) = xa + j * ya
- * x(n+N/4 ) = xb + j * yb
- * x(n+N/2 ) = xc + j * yc
- * x(n+3N 4) = xd + j * yd
- *
- *
- * Output real and imaginary data:
- * x(4r) = xa'+ j * ya'
- * x(4r+1) = xb'+ j * yb'
- * x(4r+2) = xc'+ j * yc'
- * x(4r+3) = xd'+ j * yd'
- *
- *
- * Twiddle factors for radix-4 IFFT:
- * Wn = co1 + j * (si1)
- * W2n = co2 + j * (si2)
- * W3n = co3 + j * (si3)
- 
- * The real and imaginary output values for the radix-4 butterfly are
- * xa' = xa + xb + xc + xd
- * ya' = ya + yb + yc + yd
- * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
- * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
- * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
- * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
- * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
- * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
- *
- */
-
-void arm_radix4_butterfly_inverse_q15(
-        q15_t * pSrc16,
-        uint32_t fftLen,
-  const q15_t * pCoef16,
-        uint32_t twidCoefModifier)
-{
-
-#if defined (ARM_MATH_DSP)
-
-        q31_t R, S, T, U;
-        q31_t C1, C2, C3, out1, out2;
-        uint32_t n1, n2, ic, i0, j, k;
-        
-        q15_t *ptr1;
-        q15_t *pSi0;
-        q15_t *pSi1;
-        q15_t *pSi2;
-        q15_t *pSi3;
-        
-        q31_t xaya, xbyb, xcyc, xdyd;
-
-  /* Total process is divided into three stages */
-
-  /* process first stage, middle stages, & last stage */
-
-  /*  Initializations for the first stage */
-  n2 = fftLen;
-  n1 = n2;
-
-  /* n2 = fftLen/4 */
-  n2 >>= 2U;
-
-  /* Index for twiddle coefficient */
-  ic = 0U;
-
-  /* Index for input read and output write */
-  j = n2;
-
-  pSi0 = pSrc16;
-  pSi1 = pSi0 + 2 * n2;
-  pSi2 = pSi1 + 2 * n2;
-  pSi3 = pSi2 + 2 * n2;
-
-  /* Input is in 1.15(q15) format */
-
-  /*  start of first stage process */
-  do
-  {
-    /*  Butterfly implementation */
-
-    /*  Reading i0, i0+fftLen/2 inputs */
-    /* Read ya (real), xa(imag) input */
-    T = read_q15x2 (pSi0);
-    T = __SHADD16(T, 0);
-    T = __SHADD16(T, 0);
-
-    /* Read yc (real), xc(imag) input */
-    S = read_q15x2 (pSi2);
-    S = __SHADD16(S, 0);
-    S = __SHADD16(S, 0);
-
-    /* R = packed((ya + yc), (xa + xc) ) */
-    R = __QADD16(T, S);
-
-    /* S = packed((ya - yc), (xa - xc) ) */
-    S = __QSUB16(T, S);
-
-    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-    /* Read yb (real), xb(imag) input */
-    T = read_q15x2 (pSi1);
-    T = __SHADD16(T, 0);
-    T = __SHADD16(T, 0);
-
-    /* Read yd (real), xd(imag) input */
-    U = read_q15x2 (pSi3);
-    U = __SHADD16(U, 0);
-    U = __SHADD16(U, 0);
-
-    /* T = packed((yb + yd), (xb + xd) ) */
-    T = __QADD16(T, U);
-
-    /*  writing the butterfly processed i0 sample */
-    /* xa' = xa + xb + xc + xd */
-    /* ya' = ya + yb + yc + yd */
-    write_q15x2_ia (&pSi0, __SHADD16(R, T));
-
-    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
-    R = __QSUB16(R, T);
-
-    /* co2 & si2 are read from SIMD Coefficient pointer */
-    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-    out1 = __SMUSD(C2, R) >> 16U;
-    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-    out2 = __SMUADX(C2, R);
-#else
-    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-    out1 = __SMUADX(C2, R) >> 16U;
-    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-    out2 = __SMUSD(__QSUB16(0, C2), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    /*  Reading i0+fftLen/4 */
-    /* T = packed(yb, xb) */
-    T = read_q15x2 (pSi1);
-    T = __SHADD16(T, 0);
-    T = __SHADD16(T, 0);
-
-    /* writing the butterfly processed i0 + fftLen/4 sample */
-    /* writing output(xc', yc') in little endian format */
-    write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
-
-    /*  Butterfly calculations */
-    /* U = packed(yd, xd) */
-    U = read_q15x2 (pSi3);
-    U = __SHADD16(U, 0);
-    U = __SHADD16(U, 0);
-
-    /* T = packed(yb-yd, xb-xd) */
-    T = __QSUB16(T, U);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
-    R = __QSAX(S, T);
-    /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
-    S = __QASX(S, T);
-#else
-    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
-    R = __QASX(S, T);
-    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
-    S = __QSAX(S, T);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    /* co1 & si1 are read from SIMD Coefficient pointer */
-    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
-    /*  Butterfly process for the i0+fftLen/2 sample */
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-    out1 = __SMUSD(C1, S) >> 16U;
-    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-    out2 = __SMUADX(C1, S);
-#else
-    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-    out1 = __SMUADX(C1, S) >> 16U;
-    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-    out2 = __SMUSD(__QSUB16(0, C1), S);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    /* writing output(xb', yb') in little endian format */
-    write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
-
-    /* co3 & si3 are read from SIMD Coefficient pointer */
-    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
-    /*  Butterfly process for the i0+3fftLen/4 sample */
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
-    out1 = __SMUSD(C3, R) >> 16U;
-    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-    out2 = __SMUADX(C3, R);
-#else
-    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-    out1 = __SMUADX(C3, R) >> 16U;
-    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
-    out2 = __SMUSD(__QSUB16(0, C3), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-    /* writing output(xd', yd') in little endian format */
-    write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
-
-    /*  Twiddle coefficients index modifier */
-    ic = ic + twidCoefModifier;
-
-  } while (--j);
-  /* data is in 4.11(q11) format */
-
-  /* end of first stage process */
-
-
-  /* start of middle stage process */
-
-  /*  Twiddle coefficients index modifier */
-  twidCoefModifier <<= 2U;
-
-  /*  Calculation of Middle stage */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U)
-  {
-    /*  Initializations for the middle stage */
-    n1 = n2;
-    n2 >>= 2U;
-    ic = 0U;
-
-    for (j = 0U; j <= (n2 - 1U); j++)
-    {
-      /*  index calculation for the coefficients */
-      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
-      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
-      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
-
-      /*  Twiddle coefficients index modifier */
-      ic = ic + twidCoefModifier;
-
-      pSi0 = pSrc16 + 2 * j;
-      pSi1 = pSi0 + 2 * n2;
-      pSi2 = pSi1 + 2 * n2;
-      pSi3 = pSi2 + 2 * n2;
-
-      /*  Butterfly implementation */
-      for (i0 = j; i0 < fftLen; i0 += n1)
-      {
-        /*  Reading i0, i0+fftLen/2 inputs */
-        /* Read ya (real), xa(imag) input */
-        T = read_q15x2 (pSi0);
-
-        /* Read yc (real), xc(imag) input */
-        S = read_q15x2 (pSi2);
-
-        /* R = packed( (ya + yc), (xa + xc)) */
-        R = __QADD16(T, S);
-
-        /* S = packed((ya - yc), (xa - xc)) */
-        S = __QSUB16(T, S);
-
-        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-        /* Read yb (real), xb(imag) input */
-        T = read_q15x2 (pSi1);
-
-        /* Read yd (real), xd(imag) input */
-        U = read_q15x2 (pSi3);
-
-        /* T = packed( (yb + yd), (xb + xd)) */
-        T = __QADD16(T, U);
-
-        /*  writing the butterfly processed i0 sample */
-
-        /* xa' = xa + xb + xc + xd */
-        /* ya' = ya + yb + yc + yd */
-        out1 = __SHADD16(R, T);
-        out1 = __SHADD16(out1, 0);
-        write_q15x2 (pSi0, out1);
-        pSi0 += 2 * n1;
-
-        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
-        R = __SHSUB16(R, T);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
-        out1 = __SMUSD(C2, R) >> 16U;
-
-        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        out2 = __SMUADX(C2, R);
-#else
-        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        out1 = __SMUADX(R, C2) >> 16U;
-
-        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
-        out2 = __SMUSD(__QSUB16(0, C2), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /*  Reading i0+3fftLen/4 */
-        /* Read yb (real), xb(imag) input */
-        T = read_q15x2 (pSi1);
-
-        /*  writing the butterfly processed i0 + fftLen/4 sample */
-        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
-        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
-        write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
-        pSi1 += 2 * n1;
-
-        /*  Butterfly calculations */
-
-        /* Read yd (real), xd(imag) input */
-        U = read_q15x2 (pSi3);
-
-        /* T = packed(yb-yd, xb-xd) */
-        T = __QSUB16(T, U);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
-        R = __SHSAX(S, T);
-
-        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
-        S = __SHASX(S, T);
-
-        /*  Butterfly process for the i0+fftLen/2 sample */
-        out1 = __SMUSD(C1, S) >> 16U;
-        out2 = __SMUADX(C1, S);
-#else
-        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
-        R = __SHASX(S, T);
-
-        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
-        S = __SHSAX(S, T);
-
-        /*  Butterfly process for the i0+fftLen/2 sample */
-        out1 = __SMUADX(S, C1) >> 16U;
-        out2 = __SMUSD(__QSUB16(0, C1), S);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
-        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
-        write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
-        pSi2 += 2 * n1;
-
-        /*  Butterfly process for the i0+3fftLen/4 sample */
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        out1 = __SMUSD(C3, R) >> 16U;
-        out2 = __SMUADX(C3, R);
-#else
-        out1 = __SMUADX(C3, R) >> 16U;
-        out2 = __SMUSD(__QSUB16(0, C3), R);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
-        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
-        write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
-        pSi3 += 2 * n1;
-      }
-    }
-    /*  Twiddle coefficients index modifier */
-    twidCoefModifier <<= 2U;
-  }
-  /* end of middle stage process */
-
-  /* data is in 10.6(q6) format for the 1024 point */
-  /* data is in 8.8(q8) format for the 256 point */
-  /* data is in 6.10(q10) format for the 64 point */
-  /* data is in 4.12(q12) format for the 16 point */
-
-  /*  Initializations for the last stage */
-  j = fftLen >> 2;
-
-  ptr1 = &pSrc16[0];
-
-  /* start of last stage process */
-
-  /*  Butterfly implementation */
-  do
-  {
-    /* Read xa (real), ya(imag) input */
-    xaya = read_q15x2_ia (&ptr1);
-
-    /* Read xb (real), yb(imag) input */
-    xbyb = read_q15x2_ia (&ptr1);
-
-    /* Read xc (real), yc(imag) input */
-    xcyc = read_q15x2_ia (&ptr1);
-
-    /* Read xd (real), yd(imag) input */
-    xdyd = read_q15x2_ia (&ptr1);
-
-    /* R = packed((ya + yc), (xa + xc)) */
-    R = __QADD16(xaya, xcyc);
-
-    /* T = packed((yb + yd), (xb + xd)) */
-    T = __QADD16(xbyb, xdyd);
-
-    /* pointer updation for writing */
-    ptr1 = ptr1 - 8U;
-
-
-    /* xa' = xa + xb + xc + xd */
-    /* ya' = ya + yb + yc + yd */
-    write_q15x2_ia (&ptr1, __SHADD16(R, T));
-
-    /* T = packed((yb + yd), (xb + xd)) */
-    T = __QADD16(xbyb, xdyd);
-
-    /* xc' = (xa-xb+xc-xd) */
-    /* yc' = (ya-yb+yc-yd) */
-    write_q15x2_ia (&ptr1, __SHSUB16(R, T));
-
-    /* S = packed((ya - yc), (xa - xc)) */
-    S = __QSUB16(xaya, xcyc);
-
-    /* Read yd (real), xd(imag) input */
-    /* T = packed( (yb - yd), (xb - xd))  */
-    U = __QSUB16(xbyb, xdyd);
-
-#ifndef ARM_MATH_BIG_ENDIAN
-    /* xb' = (xa+yb-xc-yd) */
-    /* yb' = (ya-xb-yc+xd) */
-    write_q15x2_ia (&ptr1, __SHASX(S, U));
-
-    /* xd' = (xa-yb-xc+yd) */
-    /* yd' = (ya+xb-yc-xd) */
-    write_q15x2_ia (&ptr1, __SHSAX(S, U));
-#else
-    /* xb' = (xa+yb-xc-yd) */
-    /* yb' = (ya-xb-yc+xd) */
-    write_q15x2_ia (&ptr1, __SHSAX(S, U));
-
-    /* xd' = (xa-yb-xc+yd) */
-    /* yd' = (ya+xb-yc-xd) */
-    write_q15x2_ia (&ptr1, __SHASX(S, U));
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-  } while (--j);
-
-  /* end of last stage  process */
-
-  /* output is in 11.5(q5) format for the 1024 point */
-  /* output is in 9.7(q7) format for the 256 point   */
-  /* output is in 7.9(q9) format for the 64 point  */
-  /* output is in 5.11(q11) format for the 16 point  */
-
-
-#else /* arm_radix4_butterfly_inverse_q15 */
-
-        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
-        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
-        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
-
-  /* Total process is divided into three stages */
-
-  /* process first stage, middle stages, & last stage */
-
-  /*  Initializations for the first stage */
-  n2 = fftLen;
-  n1 = n2;
-
-  /* n2 = fftLen/4 */
-  n2 >>= 2U;
-
-  /* Index for twiddle coefficient */
-  ic = 0U;
-
-  /* Index for input read and output write */
-  i0 = 0U;
-
-  j = n2;
-
-  /* Input is in 1.15(q15) format */
-
-  /*  Start of first stage process */
-  do
-  {
-    /*  Butterfly implementation */
-
-    /*  index calculation for the input as, */
-    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-
-    /*  Reading i0, i0+fftLen/2 inputs */
-    /* input is down scale by 4 to avoid overflow */
-    /* Read ya (real), xa(imag) input */
-    T0 = pSrc16[i0 * 2U] >> 2U;
-    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
-    /* input is down scale by 4 to avoid overflow */
-    /* Read yc (real), xc(imag) input */
-    S0 = pSrc16[i2 * 2U] >> 2U;
-    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
-
-    /* R0 = (ya + yc), R1 = (xa + xc) */
-    R0 = __SSAT(T0 + S0, 16U);
-    R1 = __SSAT(T1 + S1, 16U);
-    /* S0 = (ya - yc), S1 = (xa - xc) */
-    S0 = __SSAT(T0 - S0, 16U);
-    S1 = __SSAT(T1 - S1, 16U);
-
-    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-    /* input is down scale by 4 to avoid overflow */
-    /* Read yb (real), xb(imag) input */
-    T0 = pSrc16[i1 * 2U] >> 2U;
-    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
-    /* Read yd (real), xd(imag) input */
-    /* input is down scale by 4 to avoid overflow */
-    U0 = pSrc16[i3 * 2U] >> 2U;
-    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
-
-    /* T0 = (yb + yd), T1 = (xb + xd) */
-    T0 = __SSAT(T0 + U0, 16U);
-    T1 = __SSAT(T1 + U1, 16U);
-
-    /*  writing the butterfly processed i0 sample */
-    /* xa' = xa + xb + xc + xd */
-    /* ya' = ya + yb + yc + yd */
-    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
-    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
-
-    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
-    R0 = __SSAT(R0 - T0, 16U);
-    R1 = __SSAT(R1 - T1, 16U);
-    /* co2 & si2 are read from Coefficient pointer */
-    Co2 = pCoef16[2U * ic * 2U];
-    Si2 = pCoef16[(2U * ic * 2U) + 1U];
-    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
-    out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
-    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
-    out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
-
-    /*  Reading i0+fftLen/4 */
-    /* input is down scale by 4 to avoid overflow */
-    /* T0 = yb, T1 = xb */
-    T0 = pSrc16[i1 * 2U] >> 2U;
-    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
-
-    /* writing the butterfly processed i0 + fftLen/4 sample */
-    /* writing output(xc', yc') in little endian format */
-    pSrc16[i1 * 2U] = out1;
-    pSrc16[(i1 * 2U) + 1U] = out2;
-
-    /*  Butterfly calculations */
-    /* input is down scale by 4 to avoid overflow */
-    /* U0 = yd, U1 = xd) */
-    U0 = pSrc16[i3 * 2U] >> 2U;
-    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
-
-    /* T0 = yb-yd, T1 = xb-xd) */
-    T0 = __SSAT(T0 - U0, 16U);
-    T1 = __SSAT(T1 - U1, 16U);
-    /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
-    R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
-    R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
-    /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
-    S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
-    S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
-
-    /* co1 & si1 are read from Coefficient pointer */
-    Co1 = pCoef16[ic * 2U];
-    Si1 = pCoef16[(ic * 2U) + 1U];
-    /*  Butterfly process for the i0+fftLen/2 sample */
-    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
-    out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
-    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
-    out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
-    /* writing output(xb', yb') in little endian format */
-    pSrc16[i2 * 2U] = out1;
-    pSrc16[(i2 * 2U) + 1U] = out2;
-
-    /* Co3 & si3 are read from Coefficient pointer */
-    Co3 = pCoef16[3U * ic * 2U];
-    Si3 = pCoef16[(3U * ic * 2U) + 1U];
-    /*  Butterfly process for the i0+3fftLen/4 sample */
-    /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
-    out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
-    /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
-    out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
-    /* writing output(xd', yd') in little endian format */
-    pSrc16[i3 * 2U] = out1;
-    pSrc16[(i3 * 2U) + 1U] = out2;
-
-    /*  Twiddle coefficients index modifier */
-    ic = ic + twidCoefModifier;
-
-    /*  Updating input index */
-    i0 = i0 + 1U;
-
-  } while (--j);
-
-  /*  End of first stage process */
-
-  /* data is in 4.11(q11) format */
-
-
-  /*  Start of Middle stage process */
-
-  /*  Twiddle coefficients index modifier */
-  twidCoefModifier <<= 2U;
-
-  /*  Calculation of Middle stage */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U)
-  {
-    /*  Initializations for the middle stage */
-    n1 = n2;
-    n2 >>= 2U;
-    ic = 0U;
-
-    for (j = 0U; j <= (n2 - 1U); j++)
-    {
-      /*  index calculation for the coefficients */
-      Co1 = pCoef16[ic * 2U];
-      Si1 = pCoef16[(ic * 2U) + 1U];
-      Co2 = pCoef16[2U * ic * 2U];
-      Si2 = pCoef16[2U * ic * 2U + 1U];
-      Co3 = pCoef16[3U * ic * 2U];
-      Si3 = pCoef16[(3U * ic * 2U) + 1U];
-
-      /*  Twiddle coefficients index modifier */
-      ic = ic + twidCoefModifier;
-
-      /*  Butterfly implementation */
-      for (i0 = j; i0 < fftLen; i0 += n1)
-      {
-        /*  index calculation for the input as, */
-        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-        i1 = i0 + n2;
-        i2 = i1 + n2;
-        i3 = i2 + n2;
-
-        /*  Reading i0, i0+fftLen/2 inputs */
-        /* Read ya (real), xa(imag) input */
-        T0 = pSrc16[i0 * 2U];
-        T1 = pSrc16[(i0 * 2U) + 1U];
-
-        /* Read yc (real), xc(imag) input */
-        S0 = pSrc16[i2 * 2U];
-        S1 = pSrc16[(i2 * 2U) + 1U];
-
-
-        /* R0 = (ya + yc), R1 = (xa + xc) */
-        R0 = __SSAT(T0 + S0, 16U);
-        R1 = __SSAT(T1 + S1, 16U);
-        /* S0 = (ya - yc), S1 = (xa - xc) */
-        S0 = __SSAT(T0 - S0, 16U);
-        S1 = __SSAT(T1 - S1, 16U);
-
-        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-        /* Read yb (real), xb(imag) input */
-        T0 = pSrc16[i1 * 2U];
-        T1 = pSrc16[(i1 * 2U) + 1U];
-
-        /* Read yd (real), xd(imag) input */
-        U0 = pSrc16[i3 * 2U];
-        U1 = pSrc16[(i3 * 2U) + 1U];
-
-        /* T0 = (yb + yd), T1 = (xb + xd) */
-        T0 = __SSAT(T0 + U0, 16U);
-        T1 = __SSAT(T1 + U1, 16U);
-
-        /*  writing the butterfly processed i0 sample */
-        /* xa' = xa + xb + xc + xd */
-        /* ya' = ya + yb + yc + yd */
-        pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
-        pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
-
-        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
-        R0 = (R0 >> 1U) - (T0 >> 1U);
-        R1 = (R1 >> 1U) - (T1 >> 1U);
-
-        /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
-        out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
-        /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
-        out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
-
-        /*  Reading i0+3fftLen/4 */
-        /* Read yb (real), xb(imag) input */
-        T0 = pSrc16[i1 * 2U];
-        T1 = pSrc16[(i1 * 2U) + 1U];
-
-        /*  writing the butterfly processed i0 + fftLen/4 sample */
-        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
-        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
-        pSrc16[i1 * 2U] = out1;
-        pSrc16[(i1 * 2U) + 1U] = out2;
-
-        /*  Butterfly calculations */
-        /* Read yd (real), xd(imag) input */
-        U0 = pSrc16[i3 * 2U];
-        U1 = pSrc16[(i3 * 2U) + 1U];
-
-        /* T0 = yb-yd, T1 = xb-xd) */
-        T0 = __SSAT(T0 - U0, 16U);
-        T1 = __SSAT(T1 - U1, 16U);
-
-        /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
-        R0 = (S0 >> 1U) + (T1 >> 1U);
-        R1 = (S1 >> 1U) - (T0 >> 1U);
-
-        /* S1 = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
-        S0 = (S0 >> 1U) - (T1 >> 1U);
-        S1 = (S1 >> 1U) + (T0 >> 1U);
-
-        /*  Butterfly process for the i0+fftLen/2 sample */
-        out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
-        out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
-        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
-        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
-        pSrc16[i2 * 2U] = out1;
-        pSrc16[(i2 * 2U) + 1U] = out2;
-
-        /*  Butterfly process for the i0+3fftLen/4 sample */
-        out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
-
-        out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
-        /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
-        /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
-        pSrc16[i3 * 2U] = out1;
-        pSrc16[(i3 * 2U) + 1U] = out2;
-
-
-      }
-    }
-    /*  Twiddle coefficients index modifier */
-    twidCoefModifier <<= 2U;
-  }
-  /*  End of Middle stages process */
-
-
-  /* data is in 10.6(q6) format for the 1024 point */
-  /* data is in 8.8(q8) format for the 256 point   */
-  /* data is in 6.10(q10) format for the 64 point  */
-  /* data is in 4.12(q12) format for the 16 point  */
-
-  /* start of last stage process */
-
-
-  /*  Initializations for the last stage */
-  n1 = n2;
-  n2 >>= 2U;
-
-  /*  Butterfly implementation */
-  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
-  {
-    /*  index calculation for the input as, */
-    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-
-    /*  Reading i0, i0+fftLen/2 inputs */
-    /* Read ya (real), xa(imag) input */
-    T0 = pSrc16[i0 * 2U];
-    T1 = pSrc16[(i0 * 2U) + 1U];
-    /* Read yc (real), xc(imag) input */
-    S0 = pSrc16[i2 * 2U];
-    S1 = pSrc16[(i2 * 2U) + 1U];
-
-    /* R0 = (ya + yc), R1 = (xa + xc) */
-    R0 = __SSAT(T0 + S0, 16U);
-    R1 = __SSAT(T1 + S1, 16U);
-    /* S0 = (ya - yc), S1 = (xa - xc) */
-    S0 = __SSAT(T0 - S0, 16U);
-    S1 = __SSAT(T1 - S1, 16U);
-
-    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
-    /* Read yb (real), xb(imag) input */
-    T0 = pSrc16[i1 * 2U];
-    T1 = pSrc16[(i1 * 2U) + 1U];
-    /* Read yd (real), xd(imag) input */
-    U0 = pSrc16[i3 * 2U];
-    U1 = pSrc16[(i3 * 2U) + 1U];
-
-    /* T0 = (yb + yd), T1 = (xb + xd) */
-    T0 = __SSAT(T0 + U0, 16U);
-    T1 = __SSAT(T1 + U1, 16U);
-
-    /*  writing the butterfly processed i0 sample */
-    /* xa' = xa + xb + xc + xd */
-    /* ya' = ya + yb + yc + yd */
-    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
-    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
-
-    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
-    R0 = (R0 >> 1U) - (T0 >> 1U);
-    R1 = (R1 >> 1U) - (T1 >> 1U);
-
-    /* Read yb (real), xb(imag) input */
-    T0 = pSrc16[i1 * 2U];
-    T1 = pSrc16[(i1 * 2U) + 1U];
-
-    /*  writing the butterfly processed i0 + fftLen/4 sample */
-    /* xc' = (xa-xb+xc-xd) */
-    /* yc' = (ya-yb+yc-yd) */
-    pSrc16[i1 * 2U] = R0;
-    pSrc16[(i1 * 2U) + 1U] = R1;
-
-    /* Read yd (real), xd(imag) input */
-    U0 = pSrc16[i3 * 2U];
-    U1 = pSrc16[(i3 * 2U) + 1U];
-    /* T0 = (yb - yd), T1 = (xb - xd) */
-    T0 = __SSAT(T0 - U0, 16U);
-    T1 = __SSAT(T1 - U1, 16U);
-
-    /*  writing the butterfly processed i0 + fftLen/2 sample */
-    /* xb' = (xa-yb-xc+yd) */
-    /* yb' = (ya+xb-yc-xd) */
-    pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
-    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
-
-
-    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
-    /* xd' = (xa+yb-xc-yd) */
-    /* yd' = (ya-xb-yc+xd) */
-    pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
-    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
-  }
-  /* end of last stage  process */
-
-  /* output is in 11.5(q5) format for the 1024 point */
-  /* output is in 9.7(q7) format for the 256 point   */
-  /* output is in 7.9(q9) format for the 64 point  */
-  /* output is in 5.11(q11) format for the 16 point  */
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix4_q15.c
+ * Description:  This file has function definition of Radix-4 FFT & IFFT function and
+ *               In-place bit reversal using bit reversal table
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+
+void arm_radix4_butterfly_q15(
+        q15_t * pSrc16,
+        uint32_t fftLen,
+  const q15_t * pCoef16,
+        uint32_t twidCoefModifier);
+
+void arm_radix4_butterfly_inverse_q15(
+        q15_t * pSrc16,
+        uint32_t fftLen,
+  const q15_t * pCoef16,
+        uint32_t twidCoefModifier);
+
+void arm_bitreversal_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+
+/**
+  @brief               Processing function for the Q15 CFFT/CIFFT.
+  @deprecated          Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+  @param[in]     S     points to an instance of the Q15 CFFT/CIFFT structure.
+  @param[in,out] pSrc  points to the complex data buffer. Processing occurs in-place.
+  @return        none
+ 
+  @par Input and output formats:
+                 Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+                 Hence the output format is different for different FFT sizes.
+                 The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
+  @par
+                 \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
+                 \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
+ */
+
+void arm_cfft_radix4_q15(
+  const arm_cfft_radix4_instance_q15 * S,
+        q15_t * pSrc)
+{
+  const uint32_t fftLen = S->fftLen;   /* transform length shared by every call below */
+  if (S->ifftFlag != 1U)
+  {
+    /* Any flag value other than 1 selects the forward radix-4 butterflies */
+    arm_radix4_butterfly_q15(pSrc, fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+  else
+  {
+    /* ifftFlag == 1 selects the inverse radix-4 butterflies */
+    arm_radix4_butterfly_inverse_q15(pSrc, fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+
+  if (S->bitReverseFlag == 1U)
+  {
+    /* Reorder the in-place result through the instance's bit reversal table */
+    arm_bitreversal_q15(pSrc, fftLen, S->bitRevFactor, S->pBitRevTable);
+  }
+}
+
+/**
+  @} end of ComplexFFT group
+ */
+
+/*
+ * Radix-4 FFT algorithm used is :
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 FFT:
+ * Wn = co1 + j * (- si1)
+ * W2n = co2 + j * (- si2)
+ * W3n = co3 + j * (- si3)
+ 
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
+ * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
+ * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
+ * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
+ *
+ */
+
+/**
+  @brief         Core function for the Q15 CFFT butterfly process.
+  @param[in,out] pSrc16          points to the in-place buffer of Q15 data type
+  @param[in]     fftLen           length of the FFT
+  @param[in]     pCoef16         points to twiddle coefficient buffer
+  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+ */
+
+void arm_radix4_butterfly_q15(
+        q15_t * pSrc16,
+        uint32_t fftLen,
+  const q15_t * pCoef16,
+        uint32_t twidCoefModifier)
+{
+
+#if defined (ARM_MATH_DSP)
+
+        q31_t R, S, T, U;
+        q31_t C1, C2, C3, out1, out2;
+        uint32_t n1, n2, ic, i0, j, k;
+
+        q15_t *ptr1;
+        q15_t *pSi0;
+        q15_t *pSi1;
+        q15_t *pSi2;
+        q15_t *pSi3;
+
+        q31_t xaya, xbyb, xcyc, xdyd;
+
+  /* Total process is divided into three stages */
+
+  /* process first stage, middle stages, & last stage */
+
+  /*  Initializations for the first stage */
+  n2 = fftLen;
+  n1 = n2;
+
+  /* n2 = fftLen/4 */
+  n2 >>= 2U;
+
+  /* Index for twiddle coefficient */
+  ic = 0U;
+
+  /* Number of first-stage butterflies (fftLen/4), counted down by the loop */
+  j = n2;
+
+  pSi0 = pSrc16;
+  pSi1 = pSi0 + 2 * n2;
+  pSi2 = pSi1 + 2 * n2;
+  pSi3 = pSi2 + 2 * n2;
+
+  /* Input is in 1.15(q15) format */
+
+  /*  start of first stage process */
+  do
+  {
+    /*  Butterfly implementation */
+
+    /* Reading i0, i0+fftLen/2 inputs */
+    /* Read ya (real), xa(imag) input */
+    T = read_q15x2 (pSi0);
+    T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
+    T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
+/*
+    in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
+     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+*/
+
+    /* Read yc (real), xc(imag) input */
+    S = read_q15x2 (pSi2);
+    S = __SHADD16(S, 0);
+    S = __SHADD16(S, 0);
+
+    /* R = packed((ya + yc), (xa + xc) ) */
+    R = __QADD16(T, S);
+
+    /* S = packed((ya - yc), (xa - xc) ) */
+    S = __QSUB16(T, S);
+
+    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+    /* Read yb (real), xb(imag) input */
+    T = read_q15x2 (pSi1);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
+
+    /* Read yd (real), xd(imag) input */
+    U = read_q15x2 (pSi3);
+    U = __SHADD16(U, 0);
+    U = __SHADD16(U, 0);
+
+    /* T = packed((yb + yd), (xb + xd) ) */
+    T = __QADD16(T, U);
+
+    /*  writing the butterfly processed i0 sample */
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    write_q15x2_ia (&pSi0, __SHADD16(R, T));
+
+    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
+    R = __QSUB16(R, T);
+
+    /* co2 & si2 are read from SIMD Coefficient pointer */
+    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+    out1 = __SMUAD(C2, R) >> 16U;
+    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+    out2 = __SMUSDX(C2, R);
+#else
+    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+    out1 = __SMUSDX(R, C2) >> 16U;
+    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+    out2 = __SMUAD(C2, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /*  Reading i0+fftLen/4 */
+    /* T = packed(yb, xb) */
+    T = read_q15x2 (pSi1);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
+
+    /* writing the butterfly processed i0 + fftLen/4 sample */
+    /* writing output(xc', yc') in little endian format */
+    write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+    /*  Butterfly calculations */
+    /* U = packed(yd, xd) */
+    U = read_q15x2 (pSi3);
+    U = __SHADD16(U, 0);
+    U = __SHADD16(U, 0);
+
+    /* T = packed(yb-yd, xb-xd) */
+    T = __QSUB16(T, U);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+    R = __QASX(S, T);
+    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
+    S = __QSAX(S, T);
+#else
+    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+    R = __QSAX(S, T);
+    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
+    S = __QASX(S, T);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* co1 & si1 are read from SIMD Coefficient pointer */
+    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+    /*  Butterfly process for the i0+fftLen/2 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+    out1 = __SMUAD(C1, S) >> 16U;
+    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+    out2 = __SMUSDX(C1, S);
+#else
+    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+    out1 = __SMUSDX(S, C1) >> 16U;
+    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+    out2 = __SMUAD(C1, S);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* writing output(xb', yb') in little endian format */
+    write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
+
+    /* co3 & si3 are read from SIMD Coefficient pointer */
+    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
+    /*  Butterfly process for the i0+3fftLen/4 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+    out1 = __SMUAD(C3, R) >> 16U;
+    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+    out2 = __SMUSDX(C3, R);
+#else
+    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+    out1 = __SMUSDX(R, C3) >> 16U;
+    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+    out2 = __SMUAD(C3, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* writing output(xd', yd') in little endian format */
+    write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+    /*  Twiddle coefficients index modifier */
+    ic = ic + twidCoefModifier;
+
+  } while (--j);
+  /* data is in 4.12(q12) format: inputs were scaled by 1/4 and the butterfly adds another 1/2 */
+
+  /* end of first stage process */
+
+
+  /* start of middle stage process */
+
+  /*  Twiddle coefficients index modifier */
+  twidCoefModifier <<= 2U;
+
+  /*  Calculation of Middle stage */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U)
+  {
+    /*  Initializations for the middle stage */
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+
+    for (j = 0U; j <= (n2 - 1U); j++)
+    {
+      /*  index calculation for the coefficients */
+      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
+
+      /*  Twiddle coefficients index modifier */
+      ic = ic + twidCoefModifier;
+
+      pSi0 = pSrc16 + 2 * j;
+      pSi1 = pSi0 + 2 * n2;
+      pSi2 = pSi1 + 2 * n2;
+      pSi3 = pSi2 + 2 * n2;
+
+      /*  Butterfly implementation */
+      for (i0 = j; i0 < fftLen; i0 += n1)
+      {
+        /*  Reading i0, i0+fftLen/2 inputs */
+        /* Read ya (real), xa(imag) input */
+        T = read_q15x2 (pSi0);
+
+        /* Read yc (real), xc(imag) input */
+        S = read_q15x2 (pSi2);
+
+        /* R = packed( (ya + yc), (xa + xc)) */
+        R = __QADD16(T, S);
+
+        /* S = packed((ya - yc), (xa - xc)) */
+        S = __QSUB16(T, S);
+
+        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+        /* Read yb (real), xb(imag) input */
+        T = read_q15x2 (pSi1);
+
+        /* Read yd (real), xd(imag) input */
+        U = read_q15x2 (pSi3);
+
+        /* T = packed( (yb + yd), (xb + xd)) */
+        T = __QADD16(T, U);
+
+        /*  writing the butterfly processed i0 sample */
+
+        /* xa' = xa + xb + xc + xd */
+        /* ya' = ya + yb + yc + yd */
+        out1 = __SHADD16(R, T);
+        out1 = __SHADD16(out1, 0);
+        write_q15x2 (pSi0, out1);
+        pSi0 += 2 * n1;
+
+        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
+        R = __SHSUB16(R, T);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+        out1 = __SMUAD(C2, R) >> 16U;
+
+        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        out2 = __SMUSDX(C2, R);
+#else
+        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        out1 = __SMUSDX(R, C2) >> 16U;
+
+        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+        out2 = __SMUAD(C2, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        /*  Reading i0+fftLen/4 */
+        /* Read yb (real), xb(imag) input */
+        T = read_q15x2 (pSi1);
+
+        /*  writing the butterfly processed i0 + fftLen/4 sample */
+        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        pSi1 += 2 * n1;
+
+        /*  Butterfly calculations */
+
+        /* Read yd (real), xd(imag) input */
+        U = read_q15x2 (pSi3);
+
+        /* T = packed(yb-yd, xb-xd) */
+        T = __QSUB16(T, U);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+        R = __SHASX(S, T);
+
+        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
+        S = __SHSAX(S, T);
+
+
+        /*  Butterfly process for the i0+fftLen/2 sample */
+        out1 = __SMUAD(C1, S) >> 16U;
+        out2 = __SMUSDX(C1, S);
+#else
+        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+        R = __SHSAX(S, T);
+
+        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
+        S = __SHASX(S, T);
+
+
+        /*  Butterfly process for the i0+fftLen/2 sample */
+        out1 = __SMUSDX(S, C1) >> 16U;
+        out2 = __SMUAD(C1, S);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+        write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        pSi2 += 2 * n1;
+
+        /*  Butterfly process for the i0+3fftLen/4 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        out1 = __SMUAD(C3, R) >> 16U;
+        out2 = __SMUSDX(C3, R);
+#else
+        out1 = __SMUSDX(R, C3) >> 16U;
+        out2 = __SMUAD(C3, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+        write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        pSi3 += 2 * n1;
+      }
+    }
+    /*  Twiddle coefficients index modifier */
+    twidCoefModifier <<= 2U;
+  }
+  /* end of middle stage process */
+
+
+  /* data is in 10.6(q6) format for the 1024 point */
+  /* data is in 8.8(q8) format for the 256 point */
+  /* data is in 6.10(q10) format for the 64 point */
+  /* data is in 4.12(q12) format for the 16 point */
+
+  /*  Initializations for the last stage */
+  j = fftLen >> 2;
+
+  ptr1 = &pSrc16[0];
+
+  /* start of last stage process */
+
+  /*  Butterfly implementation */
+  do
+  {
+    /* Read xa (real), ya(imag) input */
+    xaya = read_q15x2_ia ((q15_t **) &ptr1);
+
+    /* Read xb (real), yb(imag) input */
+    xbyb = read_q15x2_ia ((q15_t **) &ptr1);
+
+    /* Read xc (real), yc(imag) input */
+    xcyc = read_q15x2_ia ((q15_t **) &ptr1);
+
+    /* Read xd (real), yd(imag) input */
+    xdyd = read_q15x2_ia ((q15_t **) &ptr1);
+
+    /* R = packed((ya + yc), (xa + xc)) */
+    R = __QADD16(xaya, xcyc);
+
+    /* T = packed((yb + yd), (xb + xd)) */
+    T = __QADD16(xbyb, xdyd);
+
+    /* step ptr1 back by 8 q15 elements (the four packed pairs read above) for in-place writing */
+    ptr1 = ptr1 - 8U;
+
+
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    write_q15x2_ia (&ptr1, __SHADD16(R, T));
+
+    /* T = packed((yb + yd), (xb + xd)) */
+    T = __QADD16(xbyb, xdyd);
+
+    /* xc' = (xa-xb+xc-xd) */
+    /* yc' = (ya-yb+yc-yd) */
+    write_q15x2_ia (&ptr1, __SHSUB16(R, T));
+
+    /* S = packed((ya - yc), (xa - xc)) */
+    S = __QSUB16(xaya, xcyc);
+
+    /* xbyb and xdyd were loaded at the top of the iteration; nothing is re-read here */
+    /* U = packed( (yb - yd), (xb - xd))  */
+    U = __QSUB16(xbyb, xdyd);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* xb' = (xa+yb-xc-yd) */
+    /* yb' = (ya-xb-yc+xd) */
+    write_q15x2_ia (&ptr1, __SHSAX(S, U));
+
+    /* xd' = (xa-yb-xc+yd) */
+    /* yd' = (ya+xb-yc-xd) */
+    write_q15x2_ia (&ptr1, __SHASX(S, U));
+#else
+    /* xb' = (xa+yb-xc-yd) */
+    /* yb' = (ya-xb-yc+xd) */
+    write_q15x2_ia (&ptr1, __SHASX(S, U));
+
+    /* xd' = (xa-yb-xc+yd) */
+    /* yd' = (ya+xb-yc-xd) */
+    write_q15x2_ia (&ptr1, __SHSAX(S, U));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+  } while (--j);
+
+  /* end of last stage process */
+
+  /* output is in 11.5(q5) format for the 1024 point */
+  /* output is in 9.7(q7) format for the 256 point   */
+  /* output is in 7.9(q9) format for the 64 point  */
+  /* output is in 5.11(q11) format for the 16 point  */
+
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
+        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
+        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
+
+  /* Total process is divided into three stages */
+
+  /* process first stage, middle stages, & last stage */
+
+  /*  Initializations for the first stage */
+  n2 = fftLen;
+  n1 = n2;
+
+  /* n2 = fftLen/4 */
+  n2 >>= 2U;
+
+  /* Index for twiddle coefficient */
+  ic = 0U;
+
+  /* Index for input read and output write */
+  i0 = 0U;
+  j = n2;
+
+  /* Input is in 1.15(q15) format */
+
+  /*  start of first stage process */
+  do
+  {
+    /*  Butterfly implementation */
+
+    /*  index calculation for the input as, */
+    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /*  Reading i0, i0+fftLen/2 inputs */
+
+    /* input is down scale by 4 to avoid overflow */
+    /* Read ya (real), xa(imag) input */
+    T0 = pSrc16[i0 * 2U] >> 2U;
+    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
+
+    /* input is down scale by 4 to avoid overflow */
+    /* Read yc (real), xc(imag) input */
+    S0 = pSrc16[i2 * 2U] >> 2U;
+    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
+
+    /* R0 = (ya + yc) */
+    R0 = __SSAT(T0 + S0, 16U);
+    /* R1 = (xa + xc) */
+    R1 = __SSAT(T1 + S1, 16U);
+
+    /* S0 = (ya - yc) */
+    S0 = __SSAT(T0 - S0, 16);
+    /* S1 = (xa - xc) */
+    S1 = __SSAT(T1 - S1, 16);
+
+    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+    /* input is down scale by 4 to avoid overflow */
+    /* Read yb (real), xb(imag) input */
+    T0 = pSrc16[i1 * 2U] >> 2U;
+    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
+
+    /* input is down scale by 4 to avoid overflow */
+    /* Read yd (real), xd(imag) input */
+    U0 = pSrc16[i3 * 2U] >> 2U;
+    U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
+
+    /* T0 = (yb + yd) */
+    T0 = __SSAT(T0 + U0, 16U);
+    /* T1 = (xb + xd) */
+    T1 = __SSAT(T1 + U1, 16U);
+
+    /*  writing the butterfly processed i0 sample */
+    /* ya' = ya + yb + yc + yd */
+    /* xa' = xa + xb + xc + xd */
+    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
+    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
+
+    /* R0 = (ya + yc) - (yb + yd) */
+    /* R1 = (xa + xc) - (xb + xd) */
+    R0 = __SSAT(R0 - T0, 16U);
+    R1 = __SSAT(R1 - T1, 16U);
+
+    /* co2 & si2 are read from Coefficient pointer */
+    Co2 = pCoef16[2U * ic * 2U];
+    Si2 = pCoef16[(2U * ic * 2U) + 1];
+
+    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+    out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
+    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+    out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
+
+    /*  Reading i0+fftLen/4 */
+    /* input is down scale by 4 to avoid overflow */
+    /* T0 = yb, T1 =  xb */
+    T0 = pSrc16[i1 * 2U] >> 2;
+    T1 = pSrc16[(i1 * 2U) + 1] >> 2;
+
+    /* writing the butterfly processed i0 + fftLen/4 sample */
+    /* writing output(xc', yc') in little endian format */
+    pSrc16[i1 * 2U] = out1;
+    pSrc16[(i1 * 2U) + 1] = out2;
+
+    /*  Butterfly calculations */
+    /* input is down scale by 4 to avoid overflow */
+    /* U0 = yd, U1 = xd */
+    U0 = pSrc16[i3 * 2U] >> 2;
+    U1 = pSrc16[(i3 * 2U) + 1] >> 2;
+    /* T0 = yb-yd */
+    T0 = __SSAT(T0 - U0, 16);
+    /* T1 = xb-xd */
+    T1 = __SSAT(T1 - U1, 16);
+
+    /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
+    R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
+    R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
+
+    /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
+    S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
+    S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
+
+    /* co1 & si1 are read from Coefficient pointer */
+    Co1 = pCoef16[ic * 2U];
+    Si1 = pCoef16[(ic * 2U) + 1];
+    /*  Butterfly process for the i0+fftLen/2 sample */
+    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+    out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
+    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+    out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
+
+    /* writing output(xb', yb') in little endian format */
+    pSrc16[i2 * 2U] = out1;
+    pSrc16[(i2 * 2U) + 1] = out2;
+
+    /* Co3 & si3 are read from Coefficient pointer */
+    Co3 = pCoef16[3U * (ic * 2U)];
+    Si3 = pCoef16[(3U * (ic * 2U)) + 1];
+    /*  Butterfly process for the i0+3fftLen/4 sample */
+    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
+    out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
+    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
+    out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
+    /* writing output(xd', yd') in little endian format */
+    pSrc16[i3 * 2U] = out1;
+    pSrc16[(i3 * 2U) + 1] = out2;
+
+    /*  Twiddle coefficients index modifier */
+    ic = ic + twidCoefModifier;
+
+    /*  Updating input index */
+    i0 = i0 + 1U;
+
+  } while (--j);
+  /* data is in 4.12(q12) format: inputs were scaled by 1/4 and the butterfly adds another 1/2 */
+
+  /* end of first stage process */
+
+
+  /* start of middle stage process */
+
+  /*  Twiddle coefficients index modifier */
+  twidCoefModifier <<= 2U;
+
+  /*  Calculation of Middle stage */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U)
+  {
+    /*  Initializations for the middle stage */
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+
+    for (j = 0U; j <= (n2 - 1U); j++)
+    {
+      /*  index calculation for the coefficients */
+      Co1 = pCoef16[ic * 2U];
+      Si1 = pCoef16[(ic * 2U) + 1U];
+      Co2 = pCoef16[2U * (ic * 2U)];
+      Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
+      Co3 = pCoef16[3U * (ic * 2U)];
+      Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
+
+      /*  Twiddle coefficients index modifier */
+      ic = ic + twidCoefModifier;
+
+      /*  Butterfly implementation */
+      for (i0 = j; i0 < fftLen; i0 += n1)
+      {
+        /*  index calculation for the input as, */
+        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+        i1 = i0 + n2;
+        i2 = i1 + n2;
+        i3 = i2 + n2;
+
+        /*  Reading i0, i0+fftLen/2 inputs */
+        /* Read ya (real), xa(imag) input */
+        T0 = pSrc16[i0 * 2U];
+        T1 = pSrc16[(i0 * 2U) + 1U];
+
+        /* Read yc (real), xc(imag) input */
+        S0 = pSrc16[i2 * 2U];
+        S1 = pSrc16[(i2 * 2U) + 1U];
+
+        /* R0 = (ya + yc), R1 = (xa + xc) */
+        R0 = __SSAT(T0 + S0, 16);
+        R1 = __SSAT(T1 + S1, 16);
+
+        /* S0 = (ya - yc), S1 =(xa - xc) */
+        S0 = __SSAT(T0 - S0, 16);
+        S1 = __SSAT(T1 - S1, 16);
+
+        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+        /* Read yb (real), xb(imag) input */
+        T0 = pSrc16[i1 * 2U];
+        T1 = pSrc16[(i1 * 2U) + 1U];
+
+        /* Read yd (real), xd(imag) input */
+        U0 = pSrc16[i3 * 2U];
+        U1 = pSrc16[(i3 * 2U) + 1U];
+
+
+        /* T0 = (yb + yd), T1 = (xb + xd) */
+        T0 = __SSAT(T0 + U0, 16);
+        T1 = __SSAT(T1 + U1, 16);
+
+        /*  writing the butterfly processed i0 sample */
+
+        /* xa' = xa + xb + xc + xd */
+        /* ya' = ya + yb + yc + yd */
+        out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
+        out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
+
+        pSrc16[i0 * 2U] = out1;
+        pSrc16[(2U * i0) + 1U] = out2;
+
+        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+        R0 = (R0 >> 1U) - (T0 >> 1U);
+        R1 = (R1 >> 1U) - (T1 >> 1U);
+
+        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+        out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
+
+        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
+
+        /*  Reading i0+fftLen/4 */
+        /* Read yb (real), xb(imag) input */
+        T0 = pSrc16[i1 * 2U];
+        T1 = pSrc16[(i1 * 2U) + 1U];
+
+        /*  writing the butterfly processed i0 + fftLen/4 sample */
+        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        pSrc16[i1 * 2U] = out1;
+        pSrc16[(i1 * 2U) + 1U] = out2;
+
+        /*  Butterfly calculations */
+
+        /* Read yd (real), xd(imag) input */
+        U0 = pSrc16[i3 * 2U];
+        U1 = pSrc16[(i3 * 2U) + 1U];
+
+        /* T0 = yb-yd, T1 = xb-xd */
+        T0 = __SSAT(T0 - U0, 16);
+        T1 = __SSAT(T1 - U1, 16);
+
+        /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
+        R0 = (S0 >> 1U) - (T1 >> 1U);
+        R1 = (S1 >> 1U) + (T0 >> 1U);
+
+        /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
+        S0 = (S0 >> 1U) + (T1 >> 1U);
+        S1 = (S1 >> 1U) - (T0 >> 1U);
+
+        /*  Butterfly process for the i0+fftLen/2 sample */
+        out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
+
+        out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
+
+        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+        pSrc16[i2 * 2U] = out1;
+        pSrc16[(i2 * 2U) + 1U] = out2;
+
+        /*  Butterfly process for the i0+3fftLen/4 sample */
+        out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
+
+        out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
+        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
+        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
+        pSrc16[i3 * 2U] = out1;
+        pSrc16[(i3 * 2U) + 1U] = out2;
+      }
+    }
+    /*  Twiddle coefficients index modifier */
+    twidCoefModifier <<= 2U;
+  }
+  /* end of middle stage process */
+
+
+  /* data is in 10.6(q6) format for the 1024 point */
+  /* data is in 8.8(q8) format for the 256 point */
+  /* data is in 6.10(q10) format for the 64 point */
+  /* data is in 4.12(q12) format for the 16 point */
+
+  /*  Initializations for the last stage */
+  n1 = n2;
+  n2 >>= 2U;
+
+  /* start of last stage process */
+
+  /*  Butterfly implementation */
+  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
+  {
+    /*  index calculation for the input as, */
+    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /*  Reading i0, i0+fftLen/2 inputs */
+    /* Read ya (real), xa(imag) input */
+    T0 = pSrc16[i0 * 2U];
+    T1 = pSrc16[(i0 * 2U) + 1U];
+
+    /* Read yc (real), xc(imag) input */
+    S0 = pSrc16[i2 * 2U];
+    S1 = pSrc16[(i2 * 2U) + 1U];
+
+    /* R0 = (ya + yc), R1 = (xa + xc) */
+    R0 = __SSAT(T0 + S0, 16U);
+    R1 = __SSAT(T1 + S1, 16U);
+
+    /* S0 = (ya - yc), S1 = (xa - xc) */
+    S0 = __SSAT(T0 - S0, 16U);
+    S1 = __SSAT(T1 - S1, 16U);
+
+    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+    /* Read yb (real), xb(imag) input */
+    T0 = pSrc16[i1 * 2U];
+    T1 = pSrc16[(i1 * 2U) + 1U];
+    /* Read yd (real), xd(imag) input */
+    U0 = pSrc16[i3 * 2U];
+    U1 = pSrc16[(i3 * 2U) + 1U];
+
+    /* T0 = (yb + yd), T1 = (xb + xd) */
+    T0 = __SSAT(T0 + U0, 16U);
+    T1 = __SSAT(T1 + U1, 16U);
+
+    /*  writing the butterfly processed i0 sample */
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
+    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
+
+    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+    R0 = (R0 >> 1U) - (T0 >> 1U);
+    R1 = (R1 >> 1U) - (T1 >> 1U);
+    /* Read yb (real), xb(imag) input */
+    T0 = pSrc16[i1 * 2U];
+    T1 = pSrc16[(i1 * 2U) + 1U];
+
+    /*  writing the butterfly processed i0 + fftLen/4 sample */
+    /* xc' = (xa-xb+xc-xd) */
+    /* yc' = (ya-yb+yc-yd) */
+    pSrc16[i1 * 2U] = R0;
+    pSrc16[(i1 * 2U) + 1U] = R1;
+
+    /* Read yd (real), xd(imag) input */
+    U0 = pSrc16[i3 * 2U];
+    U1 = pSrc16[(i3 * 2U) + 1U];
+    /* T0 = (yb - yd), T1 = (xb - xd)  */
+    T0 = __SSAT(T0 - U0, 16U);
+    T1 = __SSAT(T1 - U1, 16U);
+
+    /*  writing the butterfly processed i0 + fftLen/2 sample */
+    /* xb' = (xa+yb-xc-yd) */
+    /* yb' = (ya-xb-yc+xd) */
+    pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
+    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
+
+    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
+    /* xd' = (xa-yb-xc+yd) */
+    /* yd' = (ya+xb-yc-xd) */
+    pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
+    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
+
+  }
+
+  /* end of last stage process */
+
+  /* output is in 11.5(q5) format for the 1024 point */
+  /* output is in 9.7(q7) format for the 256 point   */
+  /* output is in 7.9(q9) format for the 64 point  */
+  /* output is in 5.11(q11) format for the 16 point  */
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+
+/**
+  @brief         Core function for the Q15 CIFFT butterfly process.
+  @param[in,out] pSrc16           points to the in-place buffer of Q15 data type
+  @param[in]     fftLen           length of the FFT
+  @param[in]     pCoef16          points to twiddle coefficient buffer
+  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+  @return        none
+ */
+
+/*
+ * Radix-4 IFFT algorithm used is :
+ *
+ * CIFFT uses same twiddle coefficients as CFFT function
+ *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
+ *
+ *
+ * IFFT is implemented with following changes in equations from FFT
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 IFFT:
+ * Wn = co1 + j * (si1)
+ * W2n = co2 + j * (si2)
+ * W3n = co3 + j * (si3)
+ *
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
+ * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
+ * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
+ * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
+ *
+ */
+
+void arm_radix4_butterfly_inverse_q15(
+        q15_t * pSrc16,
+        uint32_t fftLen,
+  const q15_t * pCoef16,
+        uint32_t twidCoefModifier)
+{
+
+#if defined (ARM_MATH_DSP)
+  /* SIMD variant: each complex sample is handled as two 16-bit halves packed in a 32-bit word */
+        q31_t R, S, T, U;
+        q31_t C1, C2, C3, out1, out2;
+        uint32_t n1, n2, ic, i0, j, k;
+        
+        q15_t *ptr1;
+        q15_t *pSi0;
+        q15_t *pSi1;
+        q15_t *pSi2;
+        q15_t *pSi3;
+        
+        q31_t xaya, xbyb, xcyc, xdyd;
+
+  /* Total process is divided into three stages */
+
+  /* process first stage, middle stages, & last stage */
+
+  /*  Initializations for the first stage */
+  n2 = fftLen;
+  n1 = n2;
+
+  /* n2 = fftLen/4 */
+  n2 >>= 2U;
+
+  /* Index for twiddle coefficient */
+  ic = 0U;
+
+  /* Loop counter: the first stage runs fftLen/4 butterflies */
+  j = n2;
+  /* pSi0..pSi3 address the four butterfly inputs, fftLen/4 complex samples apart */
+  pSi0 = pSrc16;
+  pSi1 = pSi0 + 2 * n2;
+  pSi2 = pSi1 + 2 * n2;
+  pSi3 = pSi2 + 2 * n2;
+
+  /* Input is in 1.15(q15) format */
+
+  /*  start of first stage process */
+  do
+  {
+    /*  Butterfly implementation */
+
+    /*  Reading i0, i0+fftLen/2 inputs */
+    /* Read xa (real), ya (imag) input, scaled down by 4 (two halving adds) to avoid overflow */
+    T = read_q15x2 (pSi0);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
+
+    /* Read xc (real), yc (imag) input, scaled down by 4 */
+    S = read_q15x2 (pSi2);
+    S = __SHADD16(S, 0);
+    S = __SHADD16(S, 0);
+
+    /* R = packed((ya + yc), (xa + xc) ) */
+    R = __QADD16(T, S);
+
+    /* S = packed((ya - yc), (xa - xc) ) */
+    S = __QSUB16(T, S);
+
+    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+    /* Read xb (real), yb (imag) input, scaled down by 4 */
+    T = read_q15x2 (pSi1);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
+
+    /* Read xd (real), yd (imag) input, scaled down by 4 */
+    U = read_q15x2 (pSi3);
+    U = __SHADD16(U, 0);
+    U = __SHADD16(U, 0);
+
+    /* T = packed((yb + yd), (xb + xd) ) */
+    T = __QADD16(T, U);
+
+    /*  writing the butterfly processed i0 sample */
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    write_q15x2_ia (&pSi0, __SHADD16(R, T));
+
+    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
+    R = __QSUB16(R, T);
+
+    /* co2 & si2 are read from SIMD Coefficient pointer */
+    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
+    out1 = __SMUSD(C2, R) >> 16U;
+    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+    out2 = __SMUADX(C2, R);
+#else
+    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+    out1 = __SMUADX(C2, R) >> 16U;
+    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+    out2 = __SMUSD(__QSUB16(0, C2), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /*  Reading i0+fftLen/4 */
+    /* T = packed(yb, xb) */
+    T = read_q15x2 (pSi1);
+    T = __SHADD16(T, 0);
+    T = __SHADD16(T, 0);
+
+    /* writing the butterfly processed i0 + fftLen/4 sample */
+    /* writing output(xc', yc') in little endian format */
+    write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+    /*  Butterfly calculations */
+    /* U = packed(yd, xd) */
+    U = read_q15x2 (pSi3);
+    U = __SHADD16(U, 0);
+    U = __SHADD16(U, 0);
+
+    /* T = packed(yb-yd, xb-xd) */
+    T = __QSUB16(T, U);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
+    R = __QSAX(S, T);
+    /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
+    S = __QASX(S, T);
+#else
+    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+    R = __QASX(S, T);
+    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
+    S = __QSAX(S, T);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* co1 & si1 are read from SIMD Coefficient pointer */
+    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+    /*  Butterfly process for the i0+fftLen/2 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
+    out1 = __SMUSD(C1, S) >> 16U;
+    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
+    out2 = __SMUADX(C1, S);
+#else
+    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+    out1 = __SMUADX(C1, S) >> 16U;
+    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+    out2 = __SMUSD(__QSUB16(0, C1), S);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* writing output(xb', yb') in little endian format */
+    write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
+
+    /* co3 & si3 are read from SIMD Coefficient pointer */
+    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
+    /*  Butterfly process for the i0+3fftLen/4 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
+    out1 = __SMUSD(C3, R) >> 16U;
+    /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
+    out2 = __SMUADX(C3, R);
+#else
+    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+    out1 = __SMUADX(C3, R) >> 16U;
+    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+    out2 = __SMUSD(__QSUB16(0, C3), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* writing output(xd', yd') in little endian format */
+    write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+    /*  Twiddle coefficients index modifier */
+    ic = ic + twidCoefModifier;
+
+  } while (--j);
+  /* data is in 4.11(q11) format */
+
+  /* end of first stage process */
+
+
+  /* start of middle stage process */
+
+  /*  Twiddle coefficients index modifier */
+  twidCoefModifier <<= 2U;
+
+  /*  Calculation of Middle stage */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U)
+  {
+    /*  Initializations for the middle stage */
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+
+    for (j = 0U; j <= (n2 - 1U); j++)
+    {
+      /*  index calculation for the coefficients */
+      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
+
+      /*  Twiddle coefficients index modifier */
+      ic = ic + twidCoefModifier;
+
+      pSi0 = pSrc16 + 2 * j;
+      pSi1 = pSi0 + 2 * n2;
+      pSi2 = pSi1 + 2 * n2;
+      pSi3 = pSi2 + 2 * n2;
+
+      /*  Butterfly implementation */
+      for (i0 = j; i0 < fftLen; i0 += n1)
+      {
+        /*  Reading i0, i0+fftLen/2 inputs */
+        /* Read xa (real), ya (imag) input */
+        T = read_q15x2 (pSi0);
+
+        /* Read xc (real), yc (imag) input */
+        S = read_q15x2 (pSi2);
+
+        /* R = packed( (ya + yc), (xa + xc)) */
+        R = __QADD16(T, S);
+
+        /* S = packed((ya - yc), (xa - xc)) */
+        S = __QSUB16(T, S);
+
+        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+        /* Read xb (real), yb (imag) input */
+        T = read_q15x2 (pSi1);
+
+        /* Read xd (real), yd (imag) input */
+        U = read_q15x2 (pSi3);
+
+        /* T = packed( (yb + yd), (xb + xd)) */
+        T = __QADD16(T, U);
+
+        /*  writing the butterfly processed i0 sample */
+
+        /* xa' = xa + xb + xc + xd */
+        /* ya' = ya + yb + yc + yd */
+        out1 = __SHADD16(R, T);
+        out1 = __SHADD16(out1, 0);
+        write_q15x2 (pSi0, out1);
+        pSi0 += 2 * n1;
+
+        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
+        R = __SHSUB16(R, T);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
+        out1 = __SMUSD(C2, R) >> 16U;
+
+        /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+        out2 = __SMUADX(C2, R);
+#else
+        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        out1 = __SMUADX(R, C2) >> 16U;
+
+        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+        out2 = __SMUSD(__QSUB16(0, C2), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        /*  Reading i0+fftLen/4 */
+        /* Read xb (real), yb (imag) input */
+        T = read_q15x2 (pSi1);
+
+        /*  writing the butterfly processed i0 + fftLen/4 sample */
+        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
+        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+        write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        pSi1 += 2 * n1;
+
+        /*  Butterfly calculations */
+
+        /* Read xd (real), yd (imag) input */
+        U = read_q15x2 (pSi3);
+
+        /* T = packed(yb-yd, xb-xd) */
+        T = __QSUB16(T, U);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)), halved */
+        R = __SHSAX(S, T);
+
+        /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)), halved */
+        S = __SHASX(S, T);
+
+        /*  Butterfly process for the i0+fftLen/2 sample */
+        out1 = __SMUSD(C1, S) >> 16U;
+        out2 = __SMUADX(C1, S);
+#else
+        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+        R = __SHASX(S, T);
+
+        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
+        S = __SHSAX(S, T);
+
+        /*  Butterfly process for the i0+fftLen/2 sample */
+        out1 = __SMUADX(S, C1) >> 16U;
+        out2 = __SMUSD(__QSUB16(0, C1), S);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
+        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
+        write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        pSi2 += 2 * n1;
+
+        /*  Butterfly process for the i0+3fftLen/4 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        out1 = __SMUSD(C3, R) >> 16U;
+        out2 = __SMUADX(C3, R);
+#else
+        out1 = __SMUADX(C3, R) >> 16U;
+        out2 = __SMUSD(__QSUB16(0, C3), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
+        /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
+        write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+        pSi3 += 2 * n1;
+      }
+    }
+    /*  Twiddle coefficients index modifier */
+    twidCoefModifier <<= 2U;
+  }
+  /* end of middle stage process */
+
+  /* data is in 10.6(q6) format for the 1024 point */
+  /* data is in 8.8(q8) format for the 256 point */
+  /* data is in 6.10(q10) format for the 64 point */
+  /* data is in 4.12(q12) format for the 16 point */
+
+  /*  Initializations for the last stage */
+  j = fftLen >> 2;
+
+  ptr1 = &pSrc16[0];
+
+  /* start of last stage process */
+
+  /*  Butterfly implementation */
+  do
+  {
+    /* Read xa (real), ya(imag) input */
+    xaya = read_q15x2_ia ((q15_t **) &ptr1);
+
+    /* Read xb (real), yb(imag) input */
+    xbyb = read_q15x2_ia ((q15_t **) &ptr1);
+
+    /* Read xc (real), yc(imag) input */
+    xcyc = read_q15x2_ia ((q15_t **) &ptr1);
+
+    /* Read xd (real), yd(imag) input */
+    xdyd = read_q15x2_ia ((q15_t **) &ptr1);
+
+    /* R = packed((ya + yc), (xa + xc)) */
+    R = __QADD16(xaya, xcyc);
+
+    /* T = packed((yb + yd), (xb + xd)) */
+    T = __QADD16(xbyb, xdyd);
+
+    /* rewind over the four packed samples just read so results are written in place */
+    ptr1 = ptr1 - 8U;
+
+
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    write_q15x2_ia (&ptr1, __SHADD16(R, T));
+
+    /* T = packed((yb + yd), (xb + xd)) (recomputed; same value as above) */
+    T = __QADD16(xbyb, xdyd);
+
+    /* xc' = (xa-xb+xc-xd) */
+    /* yc' = (ya-yb+yc-yd) */
+    write_q15x2_ia (&ptr1, __SHSUB16(R, T));
+
+    /* S = packed((ya - yc), (xa - xc)) */
+    S = __QSUB16(xaya, xcyc);
+
+    /* xb, yb, xd and yd are still held in xbyb / xdyd, so no re-read is needed */
+    /* U = packed( (yb - yd), (xb - xd))  */
+    U = __QSUB16(xbyb, xdyd);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    /* xb' = (xa-yb-xc+yd) */
+    /* yb' = (ya+xb-yc-xd) */
+    write_q15x2_ia (&ptr1, __SHASX(S, U));
+
+    /* xd' = (xa+yb-xc-yd) */
+    /* yd' = (ya-xb-yc+xd) */
+    write_q15x2_ia (&ptr1, __SHSAX(S, U));
+#else
+    /* xb' = (xa-yb-xc+yd) */
+    /* yb' = (ya+xb-yc-xd) */
+    write_q15x2_ia (&ptr1, __SHSAX(S, U));
+
+    /* xd' = (xa+yb-xc-yd) */
+    /* yd' = (ya-xb-yc+xd) */
+    write_q15x2_ia (&ptr1, __SHASX(S, U));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+  } while (--j);
+
+  /* end of last stage  process */
+
+  /* output is in 11.5(q5) format for the 1024 point */
+  /* output is in 9.7(q7) format for the 256 point   */
+  /* output is in 7.9(q9) format for the 64 point  */
+  /* output is in 5.11(q11) format for the 16 point  */
+
+
+#else /* #if defined (ARM_MATH_DSP) */
+  /* Scalar variant: real and imaginary parts are handled as separate q15 values */
+        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
+        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
+        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
+
+  /* Total process is divided into three stages */
+
+  /* process first stage, middle stages, & last stage */
+
+  /*  Initializations for the first stage */
+  n2 = fftLen;
+  n1 = n2;
+
+  /* n2 = fftLen/4 */
+  n2 >>= 2U;
+
+  /* Index for twiddle coefficient */
+  ic = 0U;
+
+  /* Index for input read and output write */
+  i0 = 0U;
+
+  j = n2;
+
+  /* Input is in 1.15(q15) format */
+
+  /*  Start of first stage process */
+  do
+  {
+    /*  Butterfly implementation */
+
+    /*  index calculation for the input as, */
+    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /*  Reading i0, i0+fftLen/2 inputs */
+    /* input is down scale by 4 to avoid overflow */
+    /* Read xa (real), ya (imag) input */
+    T0 = pSrc16[i0 * 2U] >> 2U;
+    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
+    /* input is down scale by 4 to avoid overflow */
+    /* Read xc (real), yc (imag) input */
+    S0 = pSrc16[i2 * 2U] >> 2U;
+    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
+
+    /* R0 = (xa + xc), R1 = (ya + yc) */
+    R0 = __SSAT(T0 + S0, 16U);
+    R1 = __SSAT(T1 + S1, 16U);
+    /* S0 = (xa - xc), S1 = (ya - yc) */
+    S0 = __SSAT(T0 - S0, 16U);
+    S1 = __SSAT(T1 - S1, 16U);
+
+    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+    /* input is down scale by 4 to avoid overflow */
+    /* Read xb (real), yb (imag) input */
+    T0 = pSrc16[i1 * 2U] >> 2U;
+    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
+    /* Read xd (real), yd (imag) input */
+    /* input is down scale by 4 to avoid overflow */
+    U0 = pSrc16[i3 * 2U] >> 2U;
+    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
+
+    /* T0 = (xb + xd), T1 = (yb + yd) */
+    T0 = __SSAT(T0 + U0, 16U);
+    T1 = __SSAT(T1 + U1, 16U);
+
+    /*  writing the butterfly processed i0 sample */
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
+    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
+
+    /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
+    R0 = __SSAT(R0 - T0, 16U);
+    R1 = __SSAT(R1 - T1, 16U);
+    /* co2 & si2 are read from Coefficient pointer */
+    Co2 = pCoef16[2U * ic * 2U];
+    Si2 = pCoef16[(2U * ic * 2U) + 1U];
+    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
+    out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
+    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+    out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
+
+    /*  Reading i0+fftLen/4 */
+    /* input is down scale by 4 to avoid overflow */
+    /* T0 = xb, T1 = yb */
+    T0 = pSrc16[i1 * 2U] >> 2U;
+    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
+
+    /* writing the butterfly processed i0 + fftLen/4 sample */
+    /* writing output(xc', yc') in little endian format */
+    pSrc16[i1 * 2U] = out1;
+    pSrc16[(i1 * 2U) + 1U] = out2;
+
+    /*  Butterfly calculations */
+    /* input is down scale by 4 to avoid overflow */
+    /* U0 = xd, U1 = yd */
+    U0 = pSrc16[i3 * 2U] >> 2U;
+    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
+
+    /* T0 = xb-xd, T1 = yb-yd */
+    T0 = __SSAT(T0 - U0, 16U);
+    T1 = __SSAT(T1 - U1, 16U);
+    /* R0 = (xa-xc) + (yb-yd), R1 = (ya-yc) - (xb-xd) */
+    R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
+    R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
+    /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd) */
+    S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
+    S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
+
+    /* co1 & si1 are read from Coefficient pointer */
+    Co1 = pCoef16[ic * 2U];
+    Si1 = pCoef16[(ic * 2U) + 1U];
+    /*  Butterfly process for the i0+fftLen/2 sample */
+    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
+    out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
+    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
+    out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
+    /* writing output(xb', yb') in little endian format */
+    pSrc16[i2 * 2U] = out1;
+    pSrc16[(i2 * 2U) + 1U] = out2;
+
+    /* Co3 & si3 are read from Coefficient pointer */
+    Co3 = pCoef16[3U * ic * 2U];
+    Si3 = pCoef16[(3U * ic * 2U) + 1U];
+    /*  Butterfly process for the i0+3fftLen/4 sample */
+    /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
+    out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
+    /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
+    out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
+    /* writing output(xd', yd') in little endian format */
+    pSrc16[i3 * 2U] = out1;
+    pSrc16[(i3 * 2U) + 1U] = out2;
+
+    /*  Twiddle coefficients index modifier */
+    ic = ic + twidCoefModifier;
+
+    /*  Updating input index */
+    i0 = i0 + 1U;
+
+  } while (--j);
+
+  /*  End of first stage process */
+
+  /* data is in 4.11(q11) format */
+
+
+  /*  Start of Middle stage process */
+
+  /*  Twiddle coefficients index modifier */
+  twidCoefModifier <<= 2U;
+
+  /*  Calculation of Middle stage */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U)
+  {
+    /*  Initializations for the middle stage */
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+
+    for (j = 0U; j <= (n2 - 1U); j++)
+    {
+      /*  index calculation for the coefficients */
+      Co1 = pCoef16[ic * 2U];
+      Si1 = pCoef16[(ic * 2U) + 1U];
+      Co2 = pCoef16[2U * ic * 2U];
+      Si2 = pCoef16[2U * ic * 2U + 1U];
+      Co3 = pCoef16[3U * ic * 2U];
+      Si3 = pCoef16[(3U * ic * 2U) + 1U];
+
+      /*  Twiddle coefficients index modifier */
+      ic = ic + twidCoefModifier;
+
+      /*  Butterfly implementation */
+      for (i0 = j; i0 < fftLen; i0 += n1)
+      {
+        /*  index calculation for the input as, */
+        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+        i1 = i0 + n2;
+        i2 = i1 + n2;
+        i3 = i2 + n2;
+
+        /*  Reading i0, i0+fftLen/2 inputs */
+        /* Read xa (real), ya (imag) input */
+        T0 = pSrc16[i0 * 2U];
+        T1 = pSrc16[(i0 * 2U) + 1U];
+
+        /* Read xc (real), yc (imag) input */
+        S0 = pSrc16[i2 * 2U];
+        S1 = pSrc16[(i2 * 2U) + 1U];
+
+
+        /* R0 = (xa + xc), R1 = (ya + yc) */
+        R0 = __SSAT(T0 + S0, 16U);
+        R1 = __SSAT(T1 + S1, 16U);
+        /* S0 = (xa - xc), S1 = (ya - yc) */
+        S0 = __SSAT(T0 - S0, 16U);
+        S1 = __SSAT(T1 - S1, 16U);
+
+        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+        /* Read xb (real), yb (imag) input */
+        T0 = pSrc16[i1 * 2U];
+        T1 = pSrc16[(i1 * 2U) + 1U];
+
+        /* Read xd (real), yd (imag) input */
+        U0 = pSrc16[i3 * 2U];
+        U1 = pSrc16[(i3 * 2U) + 1U];
+
+        /* T0 = (xb + xd), T1 = (yb + yd) */
+        T0 = __SSAT(T0 + U0, 16U);
+        T1 = __SSAT(T1 + U1, 16U);
+
+        /*  writing the butterfly processed i0 sample */
+        /* xa' = xa + xb + xc + xd */
+        /* ya' = ya + yb + yc + yd */
+        pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
+        pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
+
+        /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each term halved first */
+        R0 = (R0 >> 1U) - (T0 >> 1U);
+        R1 = (R1 >> 1U) - (T1 >> 1U);
+
+        /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
+        out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
+        /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+        out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
+
+        /*  Reading i0+fftLen/4 */
+        /* Read xb (real), yb (imag) input */
+        T0 = pSrc16[i1 * 2U];
+        T1 = pSrc16[(i1 * 2U) + 1U];
+
+        /*  writing the butterfly processed i0 + fftLen/4 sample */
+        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
+        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+        pSrc16[i1 * 2U] = out1;
+        pSrc16[(i1 * 2U) + 1U] = out2;
+
+        /*  Butterfly calculations */
+        /* Read xd (real), yd (imag) input */
+        U0 = pSrc16[i3 * 2U];
+        U1 = pSrc16[(i3 * 2U) + 1U];
+
+        /* T0 = xb-xd, T1 = yb-yd */
+        T0 = __SSAT(T0 - U0, 16U);
+        T1 = __SSAT(T1 - U1, 16U);
+
+        /* R0 = (xa-xc) + (yb-yd), R1 = (ya-yc) - (xb-xd), each term halved first */
+        R0 = (S0 >> 1U) + (T1 >> 1U);
+        R1 = (S1 >> 1U) - (T0 >> 1U);
+
+        /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd), each term halved first */
+        S0 = (S0 >> 1U) - (T1 >> 1U);
+        S1 = (S1 >> 1U) + (T0 >> 1U);
+
+        /*  Butterfly process for the i0+fftLen/2 sample */
+        out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
+        out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
+        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
+        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
+        pSrc16[i2 * 2U] = out1;
+        pSrc16[(i2 * 2U) + 1U] = out2;
+
+        /*  Butterfly process for the i0+3fftLen/4 sample */
+        out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
+
+        out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
+        /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
+        /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
+        pSrc16[i3 * 2U] = out1;
+        pSrc16[(i3 * 2U) + 1U] = out2;
+
+
+      }
+    }
+    /*  Twiddle coefficients index modifier */
+    twidCoefModifier <<= 2U;
+  }
+  /*  End of Middle stages process */
+
+
+  /* data is in 10.6(q6) format for the 1024 point */
+  /* data is in 8.8(q8) format for the 256 point   */
+  /* data is in 6.10(q10) format for the 64 point  */
+  /* data is in 4.12(q12) format for the 16 point  */
+
+  /* start of last stage process */
+
+
+  /*  Initializations for the last stage */
+  n1 = n2;
+  n2 >>= 2U;
+
+  /*  Butterfly implementation */
+  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
+  {
+    /*  index calculation for the input as, */
+    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /*  Reading i0, i0+fftLen/2 inputs */
+    /* Read xa (real), ya (imag) input */
+    T0 = pSrc16[i0 * 2U];
+    T1 = pSrc16[(i0 * 2U) + 1U];
+    /* Read xc (real), yc (imag) input */
+    S0 = pSrc16[i2 * 2U];
+    S1 = pSrc16[(i2 * 2U) + 1U];
+
+    /* R0 = (xa + xc), R1 = (ya + yc) */
+    R0 = __SSAT(T0 + S0, 16U);
+    R1 = __SSAT(T1 + S1, 16U);
+    /* S0 = (xa - xc), S1 = (ya - yc) */
+    S0 = __SSAT(T0 - S0, 16U);
+    S1 = __SSAT(T1 - S1, 16U);
+
+    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+    /* Read xb (real), yb (imag) input */
+    T0 = pSrc16[i1 * 2U];
+    T1 = pSrc16[(i1 * 2U) + 1U];
+    /* Read xd (real), yd (imag) input */
+    U0 = pSrc16[i3 * 2U];
+    U1 = pSrc16[(i3 * 2U) + 1U];
+
+    /* T0 = (xb + xd), T1 = (yb + yd) */
+    T0 = __SSAT(T0 + U0, 16U);
+    T1 = __SSAT(T1 + U1, 16U);
+
+    /*  writing the butterfly processed i0 sample */
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
+    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
+
+    /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each term halved first */
+    R0 = (R0 >> 1U) - (T0 >> 1U);
+    R1 = (R1 >> 1U) - (T1 >> 1U);
+
+    /* Read xb (real), yb (imag) input */
+    T0 = pSrc16[i1 * 2U];
+    T1 = pSrc16[(i1 * 2U) + 1U];
+
+    /*  writing the butterfly processed i0 + fftLen/4 sample */
+    /* xc' = (xa-xb+xc-xd) */
+    /* yc' = (ya-yb+yc-yd) */
+    pSrc16[i1 * 2U] = R0;
+    pSrc16[(i1 * 2U) + 1U] = R1;
+
+    /* Read xd (real), yd (imag) input */
+    U0 = pSrc16[i3 * 2U];
+    U1 = pSrc16[(i3 * 2U) + 1U];
+    /* T0 = (xb - xd), T1 = (yb - yd) */
+    T0 = __SSAT(T0 - U0, 16U);
+    T1 = __SSAT(T1 - U1, 16U);
+
+    /*  writing the butterfly processed i0 + fftLen/2 sample */
+    /* xb' = (xa-yb-xc+yd) */
+    /* yb' = (ya+xb-yc-xd) */
+    pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
+    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
+
+
+    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
+    /* xd' = (xa+yb-xc-yd) */
+    /* yd' = (ya-xb-yc+xd) */
+    pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
+    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
+  }
+  /* end of last stage  process */
+
+  /* output is in 11.5(q5) format for the 1024 point */
+  /* output is in 9.7(q7) format for the 256 point   */
+  /* output is in 7.9(q9) format for the 64 point  */
+  /* output is in 5.11(q11) format for the 16 point  */
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
index 46c1e47..b7ea7e5 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
@@ -1,827 +1,827 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix4_q31.c
- * Description:  This file has function definition of Radix-4 FFT & IFFT function and
- *               In-place bit reversal using bit reversal table
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-void arm_radix4_butterfly_inverse_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint32_t twidCoefModifier);
-
-void arm_radix4_butterfly_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint32_t twidCoefModifier);
-
-void arm_bitreversal_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab);
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup ComplexFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the Q31 CFFT/CIFFT.
-  @deprecated    Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
-  @param[in]     S    points to an instance of the Q31 CFFT/CIFFT structure
-  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
-  @return        none
- 
-  @par Input and output formats:
-                 Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
-                 Hence the output format is different for different FFT sizes.
-                 The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
-  @par
-                 \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
-                 \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
- */
-
-void arm_cfft_radix4_q31(
-  const arm_cfft_radix4_instance_q31 * S,
-        q31_t * pSrc)
-{
-  if (S->ifftFlag == 1U)
-  {
-    /* Complex IFFT radix-4 */
-    arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
-  }
-  else
-  {
-    /* Complex FFT radix-4 */
-    arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
-  }
-
-  if (S->bitReverseFlag == 1U)
-  {
-    /*  Bit Reversal */
-    arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
-  }
-
-}
-
-/**
-  @} end of ComplexFFT group
- */
-
-/*
- * Radix-4 FFT algorithm used is :
- *
- * Input real and imaginary data:
- * x(n) = xa + j * ya
- * x(n+N/4 ) = xb + j * yb
- * x(n+N/2 ) = xc + j * yc
- * x(n+3N 4) = xd + j * yd
- *
- *
- * Output real and imaginary data:
- * x(4r) = xa'+ j * ya'
- * x(4r+1) = xb'+ j * yb'
- * x(4r+2) = xc'+ j * yc'
- * x(4r+3) = xd'+ j * yd'
- *
- *
- * Twiddle factors for radix-4 FFT:
- * Wn = co1 + j * (- si1)
- * W2n = co2 + j * (- si2)
- * W3n = co3 + j * (- si3)
- *
- *  Butterfly implementation:
- * xa' = xa + xb + xc + xd
- * ya' = ya + yb + yc + yd
- * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
- * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
- * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
- * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
- * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
- * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
- *
- */
-
-/**
-  @brief         Core function for the Q31 CFFT butterfly process.
-  @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
-  @param[in]     fftLen           length of the FFT.
-  @param[in]     pCoef            points to twiddle coefficient buffer.
-  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-  @return        none
- */
-
-void arm_radix4_butterfly_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint32_t twidCoefModifier)
-{
-        uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
-        q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
-        
-        q31_t xa, xb, xc, xd;
-        q31_t ya, yb, yc, yd;
-        q31_t xa_out, xb_out, xc_out, xd_out;
-        q31_t ya_out, yb_out, yc_out, yd_out;
-        
-        q31_t *ptr1;
-
-  /* Total process is divided into three stages */
-
-  /* process first stage, middle stages, & last stage */
-
-
-  /* start of first stage process */
-
-  /*  Initializations for the first stage */
-  n2 = fftLen;
-  n1 = n2;
-  /* n2 = fftLen/4 */
-  n2 >>= 2U;
-  i0 = 0U;
-  ia1 = 0U;
-
-  j = n2;
-
-  /*  Calculation of first stage */
-  do
-  {
-    /*  index calculation for the input as, */
-    /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-
-    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
-
-    /*  Butterfly implementation */
-    /* xa + xc */
-    r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
-    /* xa - xc */
-    r2 = (pSrc[(2U * i0)] >> 4U) - (pSrc[(2U * i2)] >> 4U);
-
-    /* xb + xd */
-    t1 = (pSrc[(2U * i1)] >> 4U) + (pSrc[(2U * i3)] >> 4U);
-
-    /* ya + yc */
-    s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
-    /* ya - yc */
-    s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
-
-    /* xa' = xa + xb + xc + xd */
-    pSrc[2U * i0] = (r1 + t1);
-    /* (xa + xc) - (xb + xd) */
-    r1 = r1 - t1;
-    /* yb + yd */
-    t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
-
-    /* ya' = ya + yb + yc + yd */
-    pSrc[(2U * i0) + 1U] = (s1 + t2);
-
-    /* (ya + yc) - (yb + yd) */
-    s1 = s1 - t2;
-
-    /* yb - yd */
-    t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
-    /* xb - xd */
-    t2 = (pSrc[(2U * i1)] >> 4U) - (pSrc[(2U * i3)] >> 4U);
-
-    /*  index calculation for the coefficients */
-    ia2 = 2U * ia1;
-    co2 = pCoef[(ia2 * 2U)];
-    si2 = pCoef[(ia2 * 2U) + 1U];
-
-    /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
-    pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
-                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
-
-    /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
-    pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
-                            ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
-
-    /* (xa - xc) + (yb - yd) */
-    r1 = r2 + t1;
-    /* (xa - xc) - (yb - yd) */
-    r2 = r2 - t1;
-
-    /* (ya - yc) - (xb - xd) */
-    s1 = s2 - t2;
-    /* (ya - yc) + (xb - xd) */
-    s2 = s2 + t2;
-
-    co1 = pCoef[(ia1 * 2U)];
-    si1 = pCoef[(ia1 * 2U) + 1U];
-
-    /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
-    pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
-                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
-
-    /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
-    pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
-                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
-
-    /*  index calculation for the coefficients */
-    ia3 = 3U * ia1;
-    co3 = pCoef[(ia3 * 2U)];
-    si3 = pCoef[(ia3 * 2U) + 1U];
-
-    /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
-    pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
-                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
-
-    /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
-    pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
-                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
-
-    /*  Twiddle coefficients index modifier */
-    ia1 = ia1 + twidCoefModifier;
-
-    /*  Updating input index */
-    i0 = i0 + 1U;
-
-  } while (--j);
-
-  /* end of first stage process */
-
-  /* data is in 5.27(q27) format */
-
-
-  /* start of Middle stages process */
-
-
-  /* each stage in middle stages provides two down scaling of the input */
-
-  twidCoefModifier <<= 2U;
-
-
-  for (k = fftLen / 4U; k > 4U; k >>= 2U)
-  {
-    /*  Initializations for the first stage */
-    n1 = n2;
-    n2 >>= 2U;
-    ia1 = 0U;
-
-    /*  Calculation of first stage */
-    for (j = 0U; j <= (n2 - 1U); j++)
-    {
-      /*  index calculation for the coefficients */
-      ia2 = ia1 + ia1;
-      ia3 = ia2 + ia1;
-      co1 = pCoef[(ia1 * 2U)];
-      si1 = pCoef[(ia1 * 2U) + 1U];
-      co2 = pCoef[(ia2 * 2U)];
-      si2 = pCoef[(ia2 * 2U) + 1U];
-      co3 = pCoef[(ia3 * 2U)];
-      si3 = pCoef[(ia3 * 2U) + 1U];
-      /*  Twiddle coefficients index modifier */
-      ia1 = ia1 + twidCoefModifier;
-
-      for (i0 = j; i0 < fftLen; i0 += n1)
-      {
-        /*  index calculation for the input as, */
-        /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
-        i1 = i0 + n2;
-        i2 = i1 + n2;
-        i3 = i2 + n2;
-
-        /*  Butterfly implementation */
-        /* xa + xc */
-        r1 = pSrc[2U * i0] + pSrc[2U * i2];
-        /* xa - xc */
-        r2 = pSrc[2U * i0] - pSrc[2U * i2];
-
-        /* ya + yc */
-        s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
-        /* ya - yc */
-        s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
-
-        /* xb + xd */
-        t1 = pSrc[2U * i1] + pSrc[2U * i3];
-
-        /* xa' = xa + xb + xc + xd */
-        pSrc[2U * i0] = (r1 + t1) >> 2U;
-        /* xa + xc -(xb + xd) */
-        r1 = r1 - t1;
-
-        /* yb + yd */
-        t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
-        /* ya' = ya + yb + yc + yd */
-        pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
-
-        /* (ya + yc) - (yb + yd) */
-        s1 = s1 - t2;
-
-        /* (yb - yd) */
-        t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
-        /* (xb - xd) */
-        t2 = pSrc[2U * i1] - pSrc[2U * i3];
-
-        /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
-        pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
-                         ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
-
-        /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
-        pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
-                                ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
-
-        /* (xa - xc) + (yb - yd) */
-        r1 = r2 + t1;
-        /* (xa - xc) - (yb - yd) */
-        r2 = r2 - t1;
-
-        /* (ya - yc) -  (xb - xd) */
-        s1 = s2 - t2;
-        /* (ya - yc) +  (xb - xd) */
-        s2 = s2 + t2;
-
-        /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
-        pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
-                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
-
-        /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
-        pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
-                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
-
-        /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
-        pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
-                         ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
-
-        /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
-        pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
-                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
-      }
-    }
-    twidCoefModifier <<= 2U;
-  }
-
-  /* End of Middle stages process */
-
-  /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
-  /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
-  /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
-  /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
-
-
-  /* start of Last stage process */
-  /*  Initializations for the last stage */
-  j = fftLen >> 2;
-  ptr1 = &pSrc[0];
-
-  /*  Calculations of last stage */
-  do
-  {
-    /* Read xa (real), ya(imag) input */
-    xa = *ptr1++;
-    ya = *ptr1++;
-
-    /* Read xb (real), yb(imag) input */
-    xb = *ptr1++;
-    yb = *ptr1++;
-
-    /* Read xc (real), yc(imag) input */
-    xc = *ptr1++;
-    yc = *ptr1++;
-
-    /* Read xc (real), yc(imag) input */
-    xd = *ptr1++;
-    yd = *ptr1++;
-
-    /* xa' = xa + xb + xc + xd */
-    xa_out = xa + xb + xc + xd;
-
-    /* ya' = ya + yb + yc + yd */
-    ya_out = ya + yb + yc + yd;
-
-    /* pointer updation for writing */
-    ptr1 = ptr1 - 8U;
-
-    /* writing xa' and ya' */
-    *ptr1++ = xa_out;
-    *ptr1++ = ya_out;
-
-    xc_out = (xa - xb + xc - xd);
-    yc_out = (ya - yb + yc - yd);
-
-    /* writing xc' and yc' */
-    *ptr1++ = xc_out;
-    *ptr1++ = yc_out;
-
-    xb_out = (xa + yb - xc - yd);
-    yb_out = (ya - xb - yc + xd);
-
-    /* writing xb' and yb' */
-    *ptr1++ = xb_out;
-    *ptr1++ = yb_out;
-
-    xd_out = (xa - yb - xc + yd);
-    yd_out = (ya + xb - yc - xd);
-
-    /* writing xd' and yd' */
-    *ptr1++ = xd_out;
-    *ptr1++ = yd_out;
-
-
-  } while (--j);
-
-  /* output is in 11.21(q21) format for the 1024 point */
-  /* output is in 9.23(q23) format for the 256 point */
-  /* output is in 7.25(q25) format for the 64 point */
-  /* output is in 5.27(q27) format for the 16 point */
-
-  /* End of last stage process */
-
-}
-
-
-/**
-  @brief         Core function for the Q31 CIFFT butterfly process.
-  @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
-  @param[in]     fftLen           length of the FFT.
-  @param[in]     pCoef            points to twiddle coefficient buffer.
-  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-  @return        none
- */
-
-/*
- * Radix-4 IFFT algorithm used is :
- *
- * CIFFT uses same twiddle coefficients as CFFT Function
- *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
- *
- *
- * IFFT is implemented with following changes in equations from FFT
- *
- * Input real and imaginary data:
- * x(n) = xa + j * ya
- * x(n+N/4 ) = xb + j * yb
- * x(n+N/2 ) = xc + j * yc
- * x(n+3N 4) = xd + j * yd
- *
- *
- * Output real and imaginary data:
- * x(4r) = xa'+ j * ya'
- * x(4r+1) = xb'+ j * yb'
- * x(4r+2) = xc'+ j * yc'
- * x(4r+3) = xd'+ j * yd'
- *
- *
- * Twiddle factors for radix-4 IFFT:
- * Wn = co1 + j * (si1)
- * W2n = co2 + j * (si2)
- * W3n = co3 + j * (si3)
- 
- * The real and imaginary output values for the radix-4 butterfly are
- * xa' = xa + xb + xc + xd
- * ya' = ya + yb + yc + yd
- * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
- * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
- * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
- * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
- * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
- * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
- *
- */
-
-void arm_radix4_butterfly_inverse_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pCoef,
-        uint32_t twidCoefModifier)
-{
-        uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
-        q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
-        q31_t xa, xb, xc, xd;
-        q31_t ya, yb, yc, yd;
-        q31_t xa_out, xb_out, xc_out, xd_out;
-        q31_t ya_out, yb_out, yc_out, yd_out;
-        
-        q31_t *ptr1;
-
-  /* input is be 1.31(q31) format for all FFT sizes */
-  /* Total process is divided into three stages */
-  /* process first stage, middle stages, & last stage */
-
-  /* Start of first stage process */
-
-  /* Initializations for the first stage */
-  n2 = fftLen;
-  n1 = n2;
-  /* n2 = fftLen/4 */
-  n2 >>= 2U;
-  i0 = 0U;
-  ia1 = 0U;
-
-  j = n2;
-
-  do
-  {
-    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
-
-    /*  index calculation for the input as, */
-    /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-
-    /*  Butterfly implementation */
-    /* xa + xc */
-    r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
-    /* xa - xc */
-    r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
-
-    /* xb + xd */
-    t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
-
-    /* ya + yc */
-    s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
-    /* ya - yc */
-    s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
-
-    /* xa' = xa + xb + xc + xd */
-    pSrc[2U * i0] = (r1 + t1);
-    /* (xa + xc) - (xb + xd) */
-    r1 = r1 - t1;
-    /* yb + yd */
-    t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
-    /* ya' = ya + yb + yc + yd */
-    pSrc[(2U * i0) + 1U] = (s1 + t2);
-
-    /* (ya + yc) - (yb + yd) */
-    s1 = s1 - t2;
-
-    /* yb - yd */
-    t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
-    /* xb - xd */
-    t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
-
-    /*  index calculation for the coefficients */
-    ia2 = 2U * ia1;
-    co2 = pCoef[ia2 * 2U];
-    si2 = pCoef[(ia2 * 2U) + 1U];
-
-    /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-    pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
-                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
-
-    /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-    pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
-                          ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
-
-    /* (xa - xc) - (yb - yd) */
-    r1 = r2 - t1;
-    /* (xa - xc) + (yb - yd) */
-    r2 = r2 + t1;
-
-    /* (ya - yc) + (xb - xd) */
-    s1 = s2 + t2;
-    /* (ya - yc) - (xb - xd) */
-    s2 = s2 - t2;
-
-    co1 = pCoef[ia1 * 2U];
-    si1 = pCoef[(ia1 * 2U) + 1U];
-
-    /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-    pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
-                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
-
-    /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-    pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
-                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
-
-    /*  index calculation for the coefficients */
-    ia3 = 3U * ia1;
-    co3 = pCoef[ia3 * 2U];
-    si3 = pCoef[(ia3 * 2U) + 1U];
-
-    /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-    pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
-                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
-
-    /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-    pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
-                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
-
-    /*  Twiddle coefficients index modifier */
-    ia1 = ia1 + twidCoefModifier;
-
-    /*  Updating input index */
-    i0 = i0 + 1U;
-
-  } while (--j);
-
-  /* data is in 5.27(q27) format */
-  /* each stage provides two down scaling of the input */
-
-
-  /* Start of Middle stages process */
-
-  twidCoefModifier <<= 2U;
-
-  /*  Calculation of second stage to excluding last stage */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U)
-  {
-    /*  Initializations for the first stage */
-    n1 = n2;
-    n2 >>= 2U;
-    ia1 = 0U;
-
-    for (j = 0; j <= (n2 - 1U); j++)
-    {
-      /*  index calculation for the coefficients */
-      ia2 = ia1 + ia1;
-      ia3 = ia2 + ia1;
-      co1 = pCoef[(ia1 * 2U)];
-      si1 = pCoef[(ia1 * 2U) + 1U];
-      co2 = pCoef[(ia2 * 2U)];
-      si2 = pCoef[(ia2 * 2U) + 1U];
-      co3 = pCoef[(ia3 * 2U)];
-      si3 = pCoef[(ia3 * 2U) + 1U];
-      /*  Twiddle coefficients index modifier */
-      ia1 = ia1 + twidCoefModifier;
-
-      for (i0 = j; i0 < fftLen; i0 += n1)
-      {
-        /*  index calculation for the input as, */
-        /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
-        i1 = i0 + n2;
-        i2 = i1 + n2;
-        i3 = i2 + n2;
-
-        /*  Butterfly implementation */
-        /* xa + xc */
-        r1 = pSrc[2U * i0] + pSrc[2U * i2];
-        /* xa - xc */
-        r2 = pSrc[2U * i0] - pSrc[2U * i2];
-
-        /* ya + yc */
-        s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
-        /* ya - yc */
-        s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
-
-        /* xb + xd */
-        t1 = pSrc[2U * i1] + pSrc[2U * i3];
-
-        /* xa' = xa + xb + xc + xd */
-        pSrc[2U * i0] = (r1 + t1) >> 2U;
-        /* xa + xc -(xb + xd) */
-        r1 = r1 - t1;
-        /* yb + yd */
-        t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
-        /* ya' = ya + yb + yc + yd */
-        pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
-
-        /* (ya + yc) - (yb + yd) */
-        s1 = s1 - t2;
-
-        /* (yb - yd) */
-        t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
-        /* (xb - xd) */
-        t2 = pSrc[2U * i1] - pSrc[2U * i3];
-
-        /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
-        pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
-                         ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
-
-        /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
-        pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
-                                ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
-
-        /* (xa - xc) - (yb - yd) */
-        r1 = r2 - t1;
-        /* (xa - xc) + (yb - yd) */
-        r2 = r2 + t1;
-
-        /* (ya - yc) +  (xb - xd) */
-        s1 = s2 + t2;
-        /* (ya - yc) -  (xb - xd) */
-        s2 = s2 - t2;
-
-        /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
-        pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
-                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
-
-        /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
-        pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
-                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
-
-        /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
-        pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
-                           ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
-
-        /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
-        pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
-                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
-      }
-    }
-    twidCoefModifier <<= 2U;
-  }
-
-  /* End of Middle stages process */
-
-  /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
-  /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
-  /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
-  /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
-
-
-  /* Start of last stage process */
-
-
-  /*  Initializations for the last stage */
-  j = fftLen >> 2;
-  ptr1 = &pSrc[0];
-
-  /*  Calculations of last stage */
-  do
-  {
-    /* Read xa (real), ya(imag) input */
-    xa = *ptr1++;
-    ya = *ptr1++;
-
-    /* Read xb (real), yb(imag) input */
-    xb = *ptr1++;
-    yb = *ptr1++;
-
-    /* Read xc (real), yc(imag) input */
-    xc = *ptr1++;
-    yc = *ptr1++;
-
-    /* Read xc (real), yc(imag) input */
-    xd = *ptr1++;
-    yd = *ptr1++;
-
-    /* xa' = xa + xb + xc + xd */
-    xa_out = xa + xb + xc + xd;
-
-    /* ya' = ya + yb + yc + yd */
-    ya_out = ya + yb + yc + yd;
-
-    /* pointer updation for writing */
-    ptr1 = ptr1 - 8U;
-
-    /* writing xa' and ya' */
-    *ptr1++ = xa_out;
-    *ptr1++ = ya_out;
-
-    xc_out = (xa - xb + xc - xd);
-    yc_out = (ya - yb + yc - yd);
-
-    /* writing xc' and yc' */
-    *ptr1++ = xc_out;
-    *ptr1++ = yc_out;
-
-    xb_out = (xa - yb - xc + yd);
-    yb_out = (ya + xb - yc - xd);
-
-    /* writing xb' and yb' */
-    *ptr1++ = xb_out;
-    *ptr1++ = yb_out;
-
-    xd_out = (xa + yb - xc - yd);
-    yd_out = (ya - xb - yc + xd);
-
-    /* writing xd' and yd' */
-    *ptr1++ = xd_out;
-    *ptr1++ = yd_out;
-
-  } while (--j);
-
-  /* output is in 11.21(q21) format for the 1024 point */
-  /* output is in 9.23(q23) format for the 256 point */
-  /* output is in 7.25(q25) format for the 64 point */
-  /* output is in 5.27(q27) format for the 16 point */
-
-  /* End of last stage process */
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix4_q31.c
+ * Description:  This file has function definition of Radix-4 FFT & IFFT function and
+ *               In-place bit reversal using bit reversal table
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+void arm_radix4_butterfly_inverse_q31( /* prototype of the routine arm_cfft_radix4_q31 calls when S->ifftFlag == 1U */
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint32_t twidCoefModifier);
+
+void arm_radix4_butterfly_q31( /* prototype of the routine arm_cfft_radix4_q31 calls for any other ifftFlag value; defined further down in this file */
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint32_t twidCoefModifier);
+
+void arm_bitreversal_q31( /* prototype of the routine arm_cfft_radix4_q31 calls when S->bitReverseFlag == 1U; NOTE(review): presumably defined in another source file - confirm */
+        q31_t * pSrc,
+        uint32_t fftLen,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup ComplexFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q31 CFFT/CIFFT: runs the radix-4 butterfly selected by S->ifftFlag, then an optional bit reversal.
+  @deprecated    Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+  @param[in]     S    points to an instance of the Q31 CFFT/CIFFT structure; ifftFlag == 1 selects the inverse transform (any other value the forward one), bitReverseFlag == 1 enables the bit-reversal step
+  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
+  @return        none
+ 
+  @par Input and output formats:
+                 Internally, the input is downscaled by 2 at every stage to avoid saturation inside the CFFT/CIFFT process.
+                 Hence the output format differs between FFT sizes.
+                 The input and output formats for the different FFT sizes, and the number of bits to upscale by, are given in the tables below for CFFT and CIFFT:
+  @par
+                 \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
+                 \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
+ */
+
+void arm_cfft_radix4_q31(
+  const arm_cfft_radix4_instance_q31 * S,
+        q31_t * pSrc)
+{
+  if (S->ifftFlag == 1U)
+  {
+    /* ifftFlag is exactly 1: run the inverse (CIFFT) radix-4 butterfly on pSrc with the instance's length and twiddle settings */
+    arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+  else
+  {
+    /* any other ifftFlag value: run the forward (CFFT) radix-4 butterfly on pSrc with the same settings */
+    arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+
+  if (S->bitReverseFlag == 1U)
+  {
+    /* bitReverseFlag is exactly 1: pass pSrc to arm_bitreversal_q31 with the instance's bitRevFactor and pBitRevTable; any other value skips this step */
+    arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+  }
+
+}
+
+/**
+  @} end of ComplexFFT group
+ */
+
+/*
+ * Radix-4 FFT algorithm used is :
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 FFT:
+ * Wn = co1 + j * (- si1)
+ * W2n = co2 + j * (- si2)
+ * W3n = co3 + j * (- si3)
+ *
+ *  Butterfly implementation:
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
+ * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
+ * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
+ * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
+ *
+ */
+
+/**
+  @brief         Core function for the Q31 CFFT butterfly process.
+  @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
+  @param[in]     fftLen           length of the FFT.
+  @param[in]     pCoef            points to twiddle coefficient buffer.
+  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+  @return        none
+ */
+
+void arm_radix4_butterfly_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint32_t twidCoefModifier)
+{
+        uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
+        q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
+        
+        q31_t xa, xb, xc, xd;
+        q31_t ya, yb, yc, yd;
+        q31_t xa_out, xb_out, xc_out, xd_out;
+        q31_t ya_out, yb_out, yc_out, yd_out;
+        
+        q31_t *ptr1;
+
+  /* Total process is divided into three stages */
+
+  /* process first stage, middle stages, & last stage */
+
+
+  /* start of first stage process */
+
+  /*  Initializations for the first stage */
+  n2 = fftLen;
+  n1 = n2;
+  /* n2 = fftLen/4 */
+  n2 >>= 2U;
+  i0 = 0U;
+  ia1 = 0U;
+
+  j = n2;
+
+  /*  Calculation of first stage */
+  do
+  {
+    /*  index calculation for the input as, */
+    /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /* input is in 1.31(q31) format and provide 4 guard bits for the input */
+
+    /*  Butterfly implementation */
+    /* xa + xc */
+    r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
+    /* xa - xc */
+    r2 = (pSrc[(2U * i0)] >> 4U) - (pSrc[(2U * i2)] >> 4U);
+
+    /* xb + xd */
+    t1 = (pSrc[(2U * i1)] >> 4U) + (pSrc[(2U * i3)] >> 4U);
+
+    /* ya + yc */
+    s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
+    /* ya - yc */
+    s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
+
+    /* xa' = xa + xb + xc + xd */
+    pSrc[2U * i0] = (r1 + t1);
+    /* (xa + xc) - (xb + xd) */
+    r1 = r1 - t1;
+    /* yb + yd */
+    t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
+
+    /* ya' = ya + yb + yc + yd */
+    pSrc[(2U * i0) + 1U] = (s1 + t2);
+
+    /* (ya + yc) - (yb + yd) */
+    s1 = s1 - t2;
+
+    /* yb - yd */
+    t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
+    /* xb - xd */
+    t2 = (pSrc[(2U * i1)] >> 4U) - (pSrc[(2U * i3)] >> 4U);
+
+    /*  index calculation for the coefficients */
+    ia2 = 2U * ia1;
+    co2 = pCoef[(ia2 * 2U)];
+    si2 = pCoef[(ia2 * 2U) + 1U];
+
+    /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+    pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
+                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
+
+    /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+    pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
+                            ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
+
+    /* (xa - xc) + (yb - yd) */
+    r1 = r2 + t1;
+    /* (xa - xc) - (yb - yd) */
+    r2 = r2 - t1;
+
+    /* (ya - yc) - (xb - xd) */
+    s1 = s2 - t2;
+    /* (ya - yc) + (xb - xd) */
+    s2 = s2 + t2;
+
+    co1 = pCoef[(ia1 * 2U)];
+    si1 = pCoef[(ia1 * 2U) + 1U];
+
+    /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+    pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
+                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
+
+    /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+    pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
+                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
+
+    /*  index calculation for the coefficients */
+    ia3 = 3U * ia1;
+    co3 = pCoef[(ia3 * 2U)];
+    si3 = pCoef[(ia3 * 2U) + 1U];
+
+    /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+    pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
+                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
+
+    /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+    pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
+                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
+
+    /*  Twiddle coefficients index modifier */
+    ia1 = ia1 + twidCoefModifier;
+
+    /*  Updating input index */
+    i0 = i0 + 1U;
+
+  } while (--j);
+
+  /* end of first stage process */
+
+  /* data is in 5.27(q27) format */
+
+
+  /* start of Middle stages process */
+
+
+  /* each stage in middle stages provides two down scaling of the input */
+
+  twidCoefModifier <<= 2U;
+
+
+  for (k = fftLen / 4U; k > 4U; k >>= 2U)
+  {
+    /*  Initializations for the first stage */
+    n1 = n2;
+    n2 >>= 2U;
+    ia1 = 0U;
+
+    /*  Calculation of first stage */
+    for (j = 0U; j <= (n2 - 1U); j++)
+    {
+      /*  index calculation for the coefficients */
+      ia2 = ia1 + ia1;
+      ia3 = ia2 + ia1;
+      co1 = pCoef[(ia1 * 2U)];
+      si1 = pCoef[(ia1 * 2U) + 1U];
+      co2 = pCoef[(ia2 * 2U)];
+      si2 = pCoef[(ia2 * 2U) + 1U];
+      co3 = pCoef[(ia3 * 2U)];
+      si3 = pCoef[(ia3 * 2U) + 1U];
+      /*  Twiddle coefficients index modifier */
+      ia1 = ia1 + twidCoefModifier;
+
+      for (i0 = j; i0 < fftLen; i0 += n1)
+      {
+        /*  index calculation for the input as, */
+        /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
+        i1 = i0 + n2;
+        i2 = i1 + n2;
+        i3 = i2 + n2;
+
+        /*  Butterfly implementation */
+        /* xa + xc */
+        r1 = pSrc[2U * i0] + pSrc[2U * i2];
+        /* xa - xc */
+        r2 = pSrc[2U * i0] - pSrc[2U * i2];
+
+        /* ya + yc */
+        s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+        /* ya - yc */
+        s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+        /* xb + xd */
+        t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+        /* xa' = xa + xb + xc + xd */
+        pSrc[2U * i0] = (r1 + t1) >> 2U;
+        /* xa + xc -(xb + xd) */
+        r1 = r1 - t1;
+
+        /* yb + yd */
+        t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+        /* ya' = ya + yb + yc + yd */
+        pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
+
+        /* (ya + yc) - (yb + yd) */
+        s1 = s1 - t2;
+
+        /* (yb - yd) */
+        t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+        /* (xb - xd) */
+        t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+        /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+        pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
+                         ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
+
+        /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+        pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
+                                ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
+
+        /* (xa - xc) + (yb - yd) */
+        r1 = r2 + t1;
+        /* (xa - xc) - (yb - yd) */
+        r2 = r2 - t1;
+
+        /* (ya - yc) -  (xb - xd) */
+        s1 = s2 - t2;
+        /* (ya - yc) +  (xb - xd) */
+        s2 = s2 + t2;
+
+        /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+        pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
+                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
+
+        /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+        pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
+                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
+
+        /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+        pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
+                         ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
+
+        /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+        pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
+                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
+      }
+    }
+    twidCoefModifier <<= 2U;
+  }
+
+  /* End of Middle stages process */
+
+  /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
+  /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
+  /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
+  /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
+
+
+  /* start of Last stage process */
+  /*  Initializations for the last stage */
+  j = fftLen >> 2;
+  ptr1 = &pSrc[0];
+
+  /*  Calculations of last stage */
+  do
+  {
+    /* Read xa (real), ya(imag) input */
+    xa = *ptr1++;
+    ya = *ptr1++;
+
+    /* Read xb (real), yb(imag) input */
+    xb = *ptr1++;
+    yb = *ptr1++;
+
+    /* Read xc (real), yc(imag) input */
+    xc = *ptr1++;
+    yc = *ptr1++;
+
+    /* Read xd (real), yd(imag) input */
+    xd = *ptr1++;
+    yd = *ptr1++;
+
+    /* xa' = xa + xb + xc + xd */
+    xa_out = xa + xb + xc + xd;
+
+    /* ya' = ya + yb + yc + yd */
+    ya_out = ya + yb + yc + yd;
+
+    /* pointer updation for writing */
+    ptr1 = ptr1 - 8U;
+
+    /* writing xa' and ya' */
+    *ptr1++ = xa_out;
+    *ptr1++ = ya_out;
+
+    xc_out = (xa - xb + xc - xd);
+    yc_out = (ya - yb + yc - yd);
+
+    /* writing xc' and yc' */
+    *ptr1++ = xc_out;
+    *ptr1++ = yc_out;
+
+    xb_out = (xa + yb - xc - yd);
+    yb_out = (ya - xb - yc + xd);
+
+    /* writing xb' and yb' */
+    *ptr1++ = xb_out;
+    *ptr1++ = yb_out;
+
+    xd_out = (xa - yb - xc + yd);
+    yd_out = (ya + xb - yc - xd);
+
+    /* writing xd' and yd' */
+    *ptr1++ = xd_out;
+    *ptr1++ = yd_out;
+
+
+  } while (--j);
+
+  /* output is in 11.21(q21) format for the 1024 point */
+  /* output is in 9.23(q23) format for the 256 point */
+  /* output is in 7.25(q25) format for the 64 point */
+  /* output is in 5.27(q27) format for the 16 point */
+
+  /* End of last stage process */
+
+}
+
+
+/**
+  @brief         Core function for the Q31 CIFFT butterfly process.
+  @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
+  @param[in]     fftLen           length of the FFT.
+  @param[in]     pCoef            points to twiddle coefficient buffer.
+  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+  @return        none
+ */
+
+/*
+ * Radix-4 IFFT algorithm used is :
+ *
+ * CIFFT uses same twiddle coefficients as CFFT Function
+ *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n + fftLen/2] + (-j)^k * x[n + 3*fftLen/4]
+ *
+ *
+ * IFFT is implemented with following changes in equations from FFT
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 IFFT:
+ * Wn = co1 + j * (si1)
+ * W2n = co2 + j * (si2)
+ * W3n = co3 + j * (si3)
+ *
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
+ * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
+ * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
+ * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
+ *
+ */
+
+void arm_radix4_butterfly_inverse_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pCoef,
+        uint32_t twidCoefModifier)
+{
+        uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
+        q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
+        q31_t xa, xb, xc, xd;
+        q31_t ya, yb, yc, yd;
+        q31_t xa_out, xb_out, xc_out, xd_out;
+        q31_t ya_out, yb_out, yc_out, yd_out;
+        
+        q31_t *ptr1;
+
+  /* input is in 1.31(q31) format for all FFT sizes */
+  /* Total process is divided into three stages */
+  /* process first stage, middle stages, & last stage */
+
+  /* Start of first stage process */
+
+  /* Initializations for the first stage */
+  n2 = fftLen;
+  n1 = n2;
+  /* n2 = fftLen/4 */
+  n2 >>= 2U;
+  i0 = 0U;
+  ia1 = 0U;
+
+  j = n2;
+
+  do
+  {
+    /* input is in 1.31(q31) format; every sample read is shifted right by 4 to provide 4 guard bits */
+
+    /*  index calculation for the input as, */
+    /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /*  Butterfly implementation */
+    /* xa + xc */
+    r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
+    /* xa - xc */
+    r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
+
+    /* xb + xd */
+    t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
+
+    /* ya + yc */
+    s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
+    /* ya - yc */
+    s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
+
+    /* xa' = xa + xb + xc + xd */
+    pSrc[2U * i0] = (r1 + t1);
+    /* (xa + xc) - (xb + xd) */
+    r1 = r1 - t1;
+    /* yb + yd */
+    t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
+    /* ya' = ya + yb + yc + yd */
+    pSrc[(2U * i0) + 1U] = (s1 + t2);
+
+    /* (ya + yc) - (yb + yd) */
+    s1 = s1 - t2;
+
+    /* yb - yd */
+    t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
+    /* xb - xd */
+    t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
+
+    /*  index calculation for the coefficients */
+    ia2 = 2U * ia1;
+    co2 = pCoef[ia2 * 2U];
+    si2 = pCoef[(ia2 * 2U) + 1U];
+
+    /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+    pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
+                     ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
+
+    /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+    pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
+                          ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
+
+    /* (xa - xc) - (yb - yd) */
+    r1 = r2 - t1;
+    /* (xa - xc) + (yb - yd) */
+    r2 = r2 + t1;
+
+    /* (ya - yc) + (xb - xd) */
+    s1 = s2 + t2;
+    /* (ya - yc) - (xb - xd) */
+    s2 = s2 - t2;
+
+    co1 = pCoef[ia1 * 2U];
+    si1 = pCoef[(ia1 * 2U) + 1U];
+
+    /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+    pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
+                     ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
+
+    /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+    pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
+                            ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
+
+    /*  index calculation for the coefficients */
+    ia3 = 3U * ia1;
+    co3 = pCoef[ia3 * 2U];
+    si3 = pCoef[(ia3 * 2U) + 1U];
+
+    /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+    pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
+                     ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
+
+    /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+    pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
+                            ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
+
+    /*  Twiddle coefficients index modifier */
+    ia1 = ia1 + twidCoefModifier;
+
+    /*  Updating input index */
+    i0 = i0 + 1U;
+
+  } while (--j);
+
+  /* data is in 5.27(q27) format */
+  /* each middle stage provides two bits of down scaling of the data */
+
+
+  /* Start of Middle stages process */
+
+  twidCoefModifier <<= 2U;
+
+  /*  Calculation of the second stage up to, but excluding, the last stage */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U)
+  {
+    /*  Initializations for the current middle stage */
+    n1 = n2;
+    n2 >>= 2U;
+    ia1 = 0U;
+
+    for (j = 0; j <= (n2 - 1U); j++)
+    {
+      /*  index calculation for the coefficients */
+      ia2 = ia1 + ia1;
+      ia3 = ia2 + ia1;
+      co1 = pCoef[(ia1 * 2U)];
+      si1 = pCoef[(ia1 * 2U) + 1U];
+      co2 = pCoef[(ia2 * 2U)];
+      si2 = pCoef[(ia2 * 2U) + 1U];
+      co3 = pCoef[(ia3 * 2U)];
+      si3 = pCoef[(ia3 * 2U) + 1U];
+      /*  Twiddle coefficients index modifier */
+      ia1 = ia1 + twidCoefModifier;
+
+      for (i0 = j; i0 < fftLen; i0 += n1)
+      {
+        /*  index calculation for the input as, */
+        /*  pSrc[i0 + 0], pSrc[i0 + n2], pSrc[i0 + 2*n2], pSrc[i0 + 3*n2] */
+        i1 = i0 + n2;
+        i2 = i1 + n2;
+        i3 = i2 + n2;
+
+        /*  Butterfly implementation */
+        /* xa + xc */
+        r1 = pSrc[2U * i0] + pSrc[2U * i2];
+        /* xa - xc */
+        r2 = pSrc[2U * i0] - pSrc[2U * i2];
+
+        /* ya + yc */
+        s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+        /* ya - yc */
+        s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+        /* xb + xd */
+        t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+        /* xa' = (xa + xb + xc + xd) scaled down by 4 */
+        pSrc[2U * i0] = (r1 + t1) >> 2U;
+        /* xa + xc -(xb + xd) */
+        r1 = r1 - t1;
+        /* yb + yd */
+        t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+        /* ya' = (ya + yb + yc + yd) scaled down by 4 */
+        pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
+
+        /* (ya + yc) - (yb + yd) */
+        s1 = s1 - t2;
+
+        /* (yb - yd) */
+        t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+        /* (xb - xd) */
+        t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+        /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+        pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
+                         ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
+
+        /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+        pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
+                                ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
+
+        /* (xa - xc) - (yb - yd) */
+        r1 = r2 - t1;
+        /* (xa - xc) + (yb - yd) */
+        r2 = r2 + t1;
+
+        /* (ya - yc) +  (xb - xd) */
+        s1 = s2 + t2;
+        /* (ya - yc) -  (xb - xd) */
+        s2 = s2 - t2;
+
+        /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+        pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
+                         ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
+
+        /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+        pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
+                                ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
+
+        /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+        pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
+                           ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
+
+        /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+        pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
+                                ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
+      }
+    }
+    twidCoefModifier <<= 2U;
+  }
+
+  /* End of Middle stages process */
+
+  /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
+  /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
+  /* data is in 7.25(q25) format for the 64 point as there is 1 middle stage */
+  /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
+
+
+  /* Start of last stage process */
+
+
+  /*  Initializations for the last stage */
+  j = fftLen >> 2;
+  ptr1 = &pSrc[0];
+
+  /*  Calculations of last stage */
+  do
+  {
+    /* Read xa (real), ya(imag) input */
+    xa = *ptr1++;
+    ya = *ptr1++;
+
+    /* Read xb (real), yb(imag) input */
+    xb = *ptr1++;
+    yb = *ptr1++;
+
+    /* Read xc (real), yc(imag) input */
+    xc = *ptr1++;
+    yc = *ptr1++;
+
+    /* Read xd (real), yd(imag) input */
+    xd = *ptr1++;
+    yd = *ptr1++;
+
+    /* xa' = xa + xb + xc + xd */
+    xa_out = xa + xb + xc + xd;
+
+    /* ya' = ya + yb + yc + yd */
+    ya_out = ya + yb + yc + yd;
+
+    /* rewind the pointer to the start of this group of four complex samples for writing */
+    ptr1 = ptr1 - 8U;
+
+    /* writing xa' and ya' */
+    *ptr1++ = xa_out;
+    *ptr1++ = ya_out;
+
+    xc_out = (xa - xb + xc - xd);
+    yc_out = (ya - yb + yc - yd);
+
+    /* writing xc' and yc' */
+    *ptr1++ = xc_out;
+    *ptr1++ = yc_out;
+
+    xb_out = (xa - yb - xc + yd);
+    yb_out = (ya + xb - yc - xd);
+
+    /* writing xb' and yb' */
+    *ptr1++ = xb_out;
+    *ptr1++ = yb_out;
+
+    xd_out = (xa + yb - xc - yd);
+    yd_out = (ya - xb - yc + xd);
+
+    /* writing xd' and yd' */
+    *ptr1++ = xd_out;
+    *ptr1++ = yd_out;
+
+  } while (--j);
+
+  /* output is in 11.21(q21) format for the 1024 point */
+  /* output is in 9.23(q23) format for the 256 point */
+  /* output is in 7.25(q25) format for the 64 point */
+  /* output is in 5.27(q27) format for the 16 point */
+
+  /* End of last stage process */
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
index 7990d47..ae74977 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
@@ -1,285 +1,285 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_cfft_radix8_f32.c
- * Description:  Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-
-/* ----------------------------------------------------------------------
- * Internal helper function used by the FFTs
- * -------------------------------------------------------------------- */
-
-/**
-  brief         Core function for the floating-point CFFT butterfly process.
-  param[in,out] pSrc             points to the in-place buffer of floating-point data type.
-  param[in]     fftLen           length of the FFT.
-  param[in]     pCoef            points to the twiddle coefficient buffer.
-  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
-  return        none
-*/
-
-void arm_radix8_butterfly_f32(
-  float32_t * pSrc,
-  uint16_t fftLen,
-  const float32_t * pCoef,
-  uint16_t twidCoefModifier)
-{
-   uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7;
-   uint32_t i1, i2, i3, i4, i5, i6, i7, i8;
-   uint32_t id;
-   uint32_t n1, n2, j;
-
-   float32_t r1, r2, r3, r4, r5, r6, r7, r8;
-   float32_t t1, t2;
-   float32_t s1, s2, s3, s4, s5, s6, s7, s8;
-   float32_t p1, p2, p3, p4;
-   float32_t co2, co3, co4, co5, co6, co7, co8;
-   float32_t si2, si3, si4, si5, si6, si7, si8;
-   const float32_t C81 = 0.70710678118f;
-
-   n2 = fftLen;
-
-   do
-   {
-      n1 = n2;
-      n2 = n2 >> 3;
-      i1 = 0;
-
-      do
-      {
-         i2 = i1 + n2;
-         i3 = i2 + n2;
-         i4 = i3 + n2;
-         i5 = i4 + n2;
-         i6 = i5 + n2;
-         i7 = i6 + n2;
-         i8 = i7 + n2;
-         r1 = pSrc[2 * i1] + pSrc[2 * i5];
-         r5 = pSrc[2 * i1] - pSrc[2 * i5];
-         r2 = pSrc[2 * i2] + pSrc[2 * i6];
-         r6 = pSrc[2 * i2] - pSrc[2 * i6];
-         r3 = pSrc[2 * i3] + pSrc[2 * i7];
-         r7 = pSrc[2 * i3] - pSrc[2 * i7];
-         r4 = pSrc[2 * i4] + pSrc[2 * i8];
-         r8 = pSrc[2 * i4] - pSrc[2 * i8];
-         t1 = r1 - r3;
-         r1 = r1 + r3;
-         r3 = r2 - r4;
-         r2 = r2 + r4;
-         pSrc[2 * i1] = r1 + r2;
-         pSrc[2 * i5] = r1 - r2;
-         r1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
-         s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
-         r2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
-         s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
-         s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
-         s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
-         r4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
-         s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
-         t2 = r1 - s3;
-         r1 = r1 + s3;
-         s3 = r2 - r4;
-         r2 = r2 + r4;
-         pSrc[2 * i1 + 1] = r1 + r2;
-         pSrc[2 * i5 + 1] = r1 - r2;
-         pSrc[2 * i3]     = t1 + s3;
-         pSrc[2 * i7]     = t1 - s3;
-         pSrc[2 * i3 + 1] = t2 - r3;
-         pSrc[2 * i7 + 1] = t2 + r3;
-         r1 = (r6 - r8) * C81;
-         r6 = (r6 + r8) * C81;
-         r2 = (s6 - s8) * C81;
-         s6 = (s6 + s8) * C81;
-         t1 = r5 - r1;
-         r5 = r5 + r1;
-         r8 = r7 - r6;
-         r7 = r7 + r6;
-         t2 = s5 - r2;
-         s5 = s5 + r2;
-         s8 = s7 - s6;
-         s7 = s7 + s6;
-         pSrc[2 * i2]     = r5 + s7;
-         pSrc[2 * i8]     = r5 - s7;
-         pSrc[2 * i6]     = t1 + s8;
-         pSrc[2 * i4]     = t1 - s8;
-         pSrc[2 * i2 + 1] = s5 - r7;
-         pSrc[2 * i8 + 1] = s5 + r7;
-         pSrc[2 * i6 + 1] = t2 - r8;
-         pSrc[2 * i4 + 1] = t2 + r8;
-
-         i1 += n1;
-      } while (i1 < fftLen);
-
-      if (n2 < 8)
-         break;
-
-      ia1 = 0;
-      j = 1;
-
-      do
-      {
-         /*  index calculation for the coefficients */
-         id  = ia1 + twidCoefModifier;
-         ia1 = id;
-         ia2 = ia1 + id;
-         ia3 = ia2 + id;
-         ia4 = ia3 + id;
-         ia5 = ia4 + id;
-         ia6 = ia5 + id;
-         ia7 = ia6 + id;
-
-         co2 = pCoef[2 * ia1];
-         co3 = pCoef[2 * ia2];
-         co4 = pCoef[2 * ia3];
-         co5 = pCoef[2 * ia4];
-         co6 = pCoef[2 * ia5];
-         co7 = pCoef[2 * ia6];
-         co8 = pCoef[2 * ia7];
-         si2 = pCoef[2 * ia1 + 1];
-         si3 = pCoef[2 * ia2 + 1];
-         si4 = pCoef[2 * ia3 + 1];
-         si5 = pCoef[2 * ia4 + 1];
-         si6 = pCoef[2 * ia5 + 1];
-         si7 = pCoef[2 * ia6 + 1];
-         si8 = pCoef[2 * ia7 + 1];
-
-         i1 = j;
-
-         do
-         {
-            /*  index calculation for the input */
-            i2 = i1 + n2;
-            i3 = i2 + n2;
-            i4 = i3 + n2;
-            i5 = i4 + n2;
-            i6 = i5 + n2;
-            i7 = i6 + n2;
-            i8 = i7 + n2;
-            r1 = pSrc[2 * i1] + pSrc[2 * i5];
-            r5 = pSrc[2 * i1] - pSrc[2 * i5];
-            r2 = pSrc[2 * i2] + pSrc[2 * i6];
-            r6 = pSrc[2 * i2] - pSrc[2 * i6];
-            r3 = pSrc[2 * i3] + pSrc[2 * i7];
-            r7 = pSrc[2 * i3] - pSrc[2 * i7];
-            r4 = pSrc[2 * i4] + pSrc[2 * i8];
-            r8 = pSrc[2 * i4] - pSrc[2 * i8];
-            t1 = r1 - r3;
-            r1 = r1 + r3;
-            r3 = r2 - r4;
-            r2 = r2 + r4;
-            pSrc[2 * i1] = r1 + r2;
-            r2 = r1 - r2;
-            s1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
-            s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
-            s2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
-            s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
-            s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
-            s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
-            s4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
-            s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
-            t2 = s1 - s3;
-            s1 = s1 + s3;
-            s3 = s2 - s4;
-            s2 = s2 + s4;
-            r1 = t1 + s3;
-            t1 = t1 - s3;
-            pSrc[2 * i1 + 1] = s1 + s2;
-            s2 = s1 - s2;
-            s1 = t2 - r3;
-            t2 = t2 + r3;
-            p1 = co5 * r2;
-            p2 = si5 * s2;
-            p3 = co5 * s2;
-            p4 = si5 * r2;
-            pSrc[2 * i5]     = p1 + p2;
-            pSrc[2 * i5 + 1] = p3 - p4;
-            p1 = co3 * r1;
-            p2 = si3 * s1;
-            p3 = co3 * s1;
-            p4 = si3 * r1;
-            pSrc[2 * i3]     = p1 + p2;
-            pSrc[2 * i3 + 1] = p3 - p4;
-            p1 = co7 * t1;
-            p2 = si7 * t2;
-            p3 = co7 * t2;
-            p4 = si7 * t1;
-            pSrc[2 * i7]     = p1 + p2;
-            pSrc[2 * i7 + 1] = p3 - p4;
-            r1 = (r6 - r8) * C81;
-            r6 = (r6 + r8) * C81;
-            s1 = (s6 - s8) * C81;
-            s6 = (s6 + s8) * C81;
-            t1 = r5 - r1;
-            r5 = r5 + r1;
-            r8 = r7 - r6;
-            r7 = r7 + r6;
-            t2 = s5 - s1;
-            s5 = s5 + s1;
-            s8 = s7 - s6;
-            s7 = s7 + s6;
-            r1 = r5 + s7;
-            r5 = r5 - s7;
-            r6 = t1 + s8;
-            t1 = t1 - s8;
-            s1 = s5 - r7;
-            s5 = s5 + r7;
-            s6 = t2 - r8;
-            t2 = t2 + r8;
-            p1 = co2 * r1;
-            p2 = si2 * s1;
-            p3 = co2 * s1;
-            p4 = si2 * r1;
-            pSrc[2 * i2]     = p1 + p2;
-            pSrc[2 * i2 + 1] = p3 - p4;
-            p1 = co8 * r5;
-            p2 = si8 * s5;
-            p3 = co8 * s5;
-            p4 = si8 * r5;
-            pSrc[2 * i8]     = p1 + p2;
-            pSrc[2 * i8 + 1] = p3 - p4;
-            p1 = co6 * r6;
-            p2 = si6 * s6;
-            p3 = co6 * s6;
-            p4 = si6 * r6;
-            pSrc[2 * i6]     = p1 + p2;
-            pSrc[2 * i6 + 1] = p3 - p4;
-            p1 = co4 * t1;
-            p2 = si4 * t2;
-            p3 = co4 * t2;
-            p4 = si4 * t1;
-            pSrc[2 * i4]     = p1 + p2;
-            pSrc[2 * i4 + 1] = p3 - p4;
-
-            i1 += n1;
-         } while (i1 < fftLen);
-
-         j++;
-      } while (j < n2);
-
-      twidCoefModifier <<= 3;
-   } while (n2 > 7);
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_cfft_radix8_f32.c
+ * Description:  Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * -------------------------------------------------------------------- */
+
+/**
+  brief         Core function for the floating-point CFFT butterfly process.
+  param[in,out] pSrc             points to the in-place buffer of floating-point data type.
+  param[in]     fftLen           length of the FFT.
+  param[in]     pCoef            points to the twiddle coefficient buffer.
+  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+  return        none
+*/
+
+void arm_radix8_butterfly_f32(
+  float32_t * pSrc,
+  uint16_t fftLen,
+  const float32_t * pCoef,
+  uint16_t twidCoefModifier)
+{
+   uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7;
+   uint32_t i1, i2, i3, i4, i5, i6, i7, i8;
+   uint32_t id;
+   uint32_t n1, n2, j;
+
+   float32_t r1, r2, r3, r4, r5, r6, r7, r8;
+   float32_t t1, t2;
+   float32_t s1, s2, s3, s4, s5, s6, s7, s8;
+   float32_t p1, p2, p3, p4;
+   float32_t co2, co3, co4, co5, co6, co7, co8;
+   float32_t si2, si3, si4, si5, si6, si7, si8;
+   const float32_t C81 = 0.70710678118f;
+
+   n2 = fftLen;
+
+   do
+   {
+      n1 = n2;
+      n2 = n2 >> 3;
+      i1 = 0;
+
+      do
+      {
+         i2 = i1 + n2;
+         i3 = i2 + n2;
+         i4 = i3 + n2;
+         i5 = i4 + n2;
+         i6 = i5 + n2;
+         i7 = i6 + n2;
+         i8 = i7 + n2;
+         r1 = pSrc[2 * i1] + pSrc[2 * i5];
+         r5 = pSrc[2 * i1] - pSrc[2 * i5];
+         r2 = pSrc[2 * i2] + pSrc[2 * i6];
+         r6 = pSrc[2 * i2] - pSrc[2 * i6];
+         r3 = pSrc[2 * i3] + pSrc[2 * i7];
+         r7 = pSrc[2 * i3] - pSrc[2 * i7];
+         r4 = pSrc[2 * i4] + pSrc[2 * i8];
+         r8 = pSrc[2 * i4] - pSrc[2 * i8];
+         t1 = r1 - r3;
+         r1 = r1 + r3;
+         r3 = r2 - r4;
+         r2 = r2 + r4;
+         pSrc[2 * i1] = r1 + r2;
+         pSrc[2 * i5] = r1 - r2;
+         r1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
+         s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
+         r2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
+         s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
+         s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
+         s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
+         r4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
+         s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
+         t2 = r1 - s3;
+         r1 = r1 + s3;
+         s3 = r2 - r4;
+         r2 = r2 + r4;
+         pSrc[2 * i1 + 1] = r1 + r2;
+         pSrc[2 * i5 + 1] = r1 - r2;
+         pSrc[2 * i3]     = t1 + s3;
+         pSrc[2 * i7]     = t1 - s3;
+         pSrc[2 * i3 + 1] = t2 - r3;
+         pSrc[2 * i7 + 1] = t2 + r3;
+         r1 = (r6 - r8) * C81;
+         r6 = (r6 + r8) * C81;
+         r2 = (s6 - s8) * C81;
+         s6 = (s6 + s8) * C81;
+         t1 = r5 - r1;
+         r5 = r5 + r1;
+         r8 = r7 - r6;
+         r7 = r7 + r6;
+         t2 = s5 - r2;
+         s5 = s5 + r2;
+         s8 = s7 - s6;
+         s7 = s7 + s6;
+         pSrc[2 * i2]     = r5 + s7;
+         pSrc[2 * i8]     = r5 - s7;
+         pSrc[2 * i6]     = t1 + s8;
+         pSrc[2 * i4]     = t1 - s8;
+         pSrc[2 * i2 + 1] = s5 - r7;
+         pSrc[2 * i8 + 1] = s5 + r7;
+         pSrc[2 * i6 + 1] = t2 - r8;
+         pSrc[2 * i4 + 1] = t2 + r8;
+
+         i1 += n1;
+      } while (i1 < fftLen);
+
+      if (n2 < 8)
+         break;
+
+      ia1 = 0;
+      j = 1;
+
+      do
+      {
+         /*  index calculation for the coefficients */
+         id  = ia1 + twidCoefModifier;
+         ia1 = id;
+         ia2 = ia1 + id;
+         ia3 = ia2 + id;
+         ia4 = ia3 + id;
+         ia5 = ia4 + id;
+         ia6 = ia5 + id;
+         ia7 = ia6 + id;
+
+         co2 = pCoef[2 * ia1];
+         co3 = pCoef[2 * ia2];
+         co4 = pCoef[2 * ia3];
+         co5 = pCoef[2 * ia4];
+         co6 = pCoef[2 * ia5];
+         co7 = pCoef[2 * ia6];
+         co8 = pCoef[2 * ia7];
+         si2 = pCoef[2 * ia1 + 1];
+         si3 = pCoef[2 * ia2 + 1];
+         si4 = pCoef[2 * ia3 + 1];
+         si5 = pCoef[2 * ia4 + 1];
+         si6 = pCoef[2 * ia5 + 1];
+         si7 = pCoef[2 * ia6 + 1];
+         si8 = pCoef[2 * ia7 + 1];
+
+         i1 = j;
+
+         do
+         {
+            /*  index calculation for the input */
+            i2 = i1 + n2;
+            i3 = i2 + n2;
+            i4 = i3 + n2;
+            i5 = i4 + n2;
+            i6 = i5 + n2;
+            i7 = i6 + n2;
+            i8 = i7 + n2;
+            r1 = pSrc[2 * i1] + pSrc[2 * i5];
+            r5 = pSrc[2 * i1] - pSrc[2 * i5];
+            r2 = pSrc[2 * i2] + pSrc[2 * i6];
+            r6 = pSrc[2 * i2] - pSrc[2 * i6];
+            r3 = pSrc[2 * i3] + pSrc[2 * i7];
+            r7 = pSrc[2 * i3] - pSrc[2 * i7];
+            r4 = pSrc[2 * i4] + pSrc[2 * i8];
+            r8 = pSrc[2 * i4] - pSrc[2 * i8];
+            t1 = r1 - r3;
+            r1 = r1 + r3;
+            r3 = r2 - r4;
+            r2 = r2 + r4;
+            pSrc[2 * i1] = r1 + r2;
+            r2 = r1 - r2;
+            s1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
+            s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
+            s2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
+            s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
+            s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
+            s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
+            s4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
+            s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
+            t2 = s1 - s3;
+            s1 = s1 + s3;
+            s3 = s2 - s4;
+            s2 = s2 + s4;
+            r1 = t1 + s3;
+            t1 = t1 - s3;
+            pSrc[2 * i1 + 1] = s1 + s2;
+            s2 = s1 - s2;
+            s1 = t2 - r3;
+            t2 = t2 + r3;
+            p1 = co5 * r2;
+            p2 = si5 * s2;
+            p3 = co5 * s2;
+            p4 = si5 * r2;
+            pSrc[2 * i5]     = p1 + p2;
+            pSrc[2 * i5 + 1] = p3 - p4;
+            p1 = co3 * r1;
+            p2 = si3 * s1;
+            p3 = co3 * s1;
+            p4 = si3 * r1;
+            pSrc[2 * i3]     = p1 + p2;
+            pSrc[2 * i3 + 1] = p3 - p4;
+            p1 = co7 * t1;
+            p2 = si7 * t2;
+            p3 = co7 * t2;
+            p4 = si7 * t1;
+            pSrc[2 * i7]     = p1 + p2;
+            pSrc[2 * i7 + 1] = p3 - p4;
+            r1 = (r6 - r8) * C81;
+            r6 = (r6 + r8) * C81;
+            s1 = (s6 - s8) * C81;
+            s6 = (s6 + s8) * C81;
+            t1 = r5 - r1;
+            r5 = r5 + r1;
+            r8 = r7 - r6;
+            r7 = r7 + r6;
+            t2 = s5 - s1;
+            s5 = s5 + s1;
+            s8 = s7 - s6;
+            s7 = s7 + s6;
+            r1 = r5 + s7;
+            r5 = r5 - s7;
+            r6 = t1 + s8;
+            t1 = t1 - s8;
+            s1 = s5 - r7;
+            s5 = s5 + r7;
+            s6 = t2 - r8;
+            t2 = t2 + r8;
+            p1 = co2 * r1;
+            p2 = si2 * s1;
+            p3 = co2 * s1;
+            p4 = si2 * r1;
+            pSrc[2 * i2]     = p1 + p2;
+            pSrc[2 * i2 + 1] = p3 - p4;
+            p1 = co8 * r5;
+            p2 = si8 * s5;
+            p3 = co8 * s5;
+            p4 = si8 * r5;
+            pSrc[2 * i8]     = p1 + p2;
+            pSrc[2 * i8 + 1] = p3 - p4;
+            p1 = co6 * r6;
+            p2 = si6 * s6;
+            p3 = co6 * s6;
+            p4 = si6 * r6;
+            pSrc[2 * i6]     = p1 + p2;
+            pSrc[2 * i6 + 1] = p3 - p4;
+            p1 = co4 * t1;
+            p2 = si4 * t2;
+            p3 = co4 * t2;
+            p4 = si4 * t1;
+            pSrc[2 * i4]     = p1 + p2;
+            pSrc[2 * i4 + 1] = p3 - p4;
+
+            i1 += n1;
+         } while (i1 < fftLen);
+
+         j++;
+      } while (j < n2);
+
+      twidCoefModifier <<= 3;
+   } while (n2 > 7);
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c
index b9dff3f..729ebf1 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c
@@ -1,448 +1,448 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_dct4_f32.c
- * Description:  Processing function of DCT4 & IDCT4 F32
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @defgroup DCT4_IDCT4 DCT Type IV Functions
-
-  Representation of signals by minimum number of values is important for storage and transmission.
-  The possibility of large discontinuity between the beginning and end of a period of a signal
-  in DFT can be avoided by extending the signal so that it is even-symmetric.
-  Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the
-  spectrum and is very widely used in signal and image coding applications.
-  The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions.
-  DCT has an excellent energy-packing capability, hence has many applications and in data compression in particular.
-  
-  DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal.
-  Reordering of the input data makes the computation of DCT just a problem of
-  computing the DFT of a real signal with a few additional operations.
-  This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations.
-  
-  DCT type-II can be implemented using Fast fourier transform (FFT) internally, as the transform is applied on real values, Real FFT can be used.
-  DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing.
-  DCT2 implementation can be described in the following steps:
-  - Re-ordering input
-  - Calculating Real FFT
-  - Multiplication of weights and Real FFT output and getting real part from the product.
-  
-  This process is explained by the block diagram below:
-  \image html DCT4.gif "Discrete Cosine Transform - type-IV"
- 
-  @par           Algorithm
-                   The N-point type-IV DCT is defined as a real, linear transformation by the formula:
-                   \image html DCT4Equation.gif
-                   where <code>k = 0, 1, 2, ..., N-1</code>
-  @par
-                   Its inverse is defined as follows:
-                   \image html IDCT4Equation.gif
-                   where <code>n = 0, 1, 2, ..., N-1</code>
-  @par
-                   The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N).
-                   The symmetry of the transform matrix indicates that the fast algorithms for the forward
-                   and inverse transform computation are identical.
-                   Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both.
- 
-  @par           Lengths supported by the transform:
-                   As DCT4 internally uses Real FFT, it supports all the lengths 128, 512, 2048 and 8192.
-                   The library provides separate functions for Q15, Q31, and floating-point data types.
-
-  @par           Instance Structure
-                   The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure.
-                   A separate instance structure must be defined for each transform.
-                   There are separate instance structure declarations for each of the 3 supported data types.
-                 
-  @par           Initialization Functions
-                   There is also an associated initialization function for each data type.
-                   The initialization function performs the following operations:
-                   - Sets the values of the internal structure fields.
-                   - Initializes Real FFT as its process function is used internally in DCT4, by calling \ref arm_rfft_init_f32().
-  @par
-                   Use of the initialization function is optional.
-                   However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-                   To place an instance structure into a const data section, the instance structure must be manually initialized.
-                   Manually initialize the instance structure as follows:
-  <pre>
-      arm_dct4_instance_f32 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
-      arm_dct4_instance_q31 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
-      arm_dct4_instance_q15 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
-  </pre>
-                   where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4;
-                   \c normalize is normalizing factor used and is equal to <code>sqrt(2/N)</code>;
-                   \c pTwiddle points to the twiddle factor table;
-                   \c pCosFactor points to the cosFactor table;
-                   \c pRfft points to the real FFT instance;
-                   \c pCfft points to the complex FFT instance;
-                   The CFFT and RFFT structures also needs to be initialized, refer to arm_cfft_radix4_f32()
-                   and arm_rfft_f32() respectively for details regarding static initialization.
- 
-  @par           Fixed-Point Behavior
-                   Care must be taken when using the fixed-point versions of the DCT4 transform functions.
-                   In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
-                   Refer to the function specific documentation below for usage guidelines.
- */
-
- /**
-  @addtogroup DCT4_IDCT4
-  @{
- */
-
-/**
-  @brief         Processing function for the floating-point DCT4/IDCT4.
-  @param[in]     S             points to an instance of the floating-point DCT4/IDCT4 structure
-  @param[in]     pState        points to state buffer
-  @param[in,out] pInlineBuffer points to the in-place input and output buffer
-  @return        none
- */
-
-void arm_dct4_f32(
-  const arm_dct4_instance_f32 * S,
-        float32_t * pState,
-        float32_t * pInlineBuffer)
-{
-  const float32_t *weights = S->pTwiddle;              /* Pointer to the Weights table */
-  const float32_t *cosFact = S->pCosFactor;            /* Pointer to the cos factors table */
-        float32_t *pS1, *pS2, *pbuff;                  /* Temporary pointers for input buffer and pState buffer */
-        float32_t in;                                  /* Temporary variable */
-        uint32_t i;                                    /* Loop counter */
-
-
-  /* DCT4 computation involves DCT2 (which is calculated using RFFT)
-   * along with some pre-processing and post-processing.
-   * Computational procedure is explained as follows:
-   * (a) Pre-processing involves multiplying input with cos factor,
-   *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
-   *              where,
-   *                 r(n) -- output of preprocessing
-   *                 u(n) -- input to preprocessing(actual Source buffer)
-   * (b) Calculation of DCT2 using FFT is divided into three steps:
-   *                  Step1: Re-ordering of even and odd elements of input.
-   *                  Step2: Calculating FFT of the re-ordered input.
-   *                  Step3: Taking the real part of the product of FFT output and weights.
-   * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
-   *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *                        where,
-   *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
-   * (d) Multiplying the output with the normalizing factor sqrt(2/N).
-   */
-
-  /*-------- Pre-processing ------------*/
-  /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
-  arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N);
-  arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N);
-
-  /* ----------------------------------------------------------------
-   * Step1: Re-ordering of even and odd elements as
-   *             pState[i] =  pInlineBuffer[2*i] and
-   *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
-   ---------------------------------------------------------------------*/
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
-  pS2 = pState + (S->N - 1U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-
-#if defined (ARM_MATH_LOOPUNROLL)
-
-  /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
-  i = S->Nby2 >> 2U;
-
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  do
-  {
-    /* Re-ordering of even and odd elements */
-    /* pState[i] =  pInlineBuffer[2*i] */
-    *pS1++ = *pbuff++;
-    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
-  i = S->N >> 2U;
-
-  /* Processing with loop unrolling 4 times as N is always multiple of 4.
-   * Compute 4 outputs at a time */
-  do
-  {
-    /* Writing the re-ordered output back to inplace input buffer */
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-
-  /* ---------------------------------------------------------
-   *     Step2: Calculate RFFT for N-point input
-   * ---------------------------------------------------------- */
-  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
-  arm_rfft_f32 (S->pRfft, pInlineBuffer, pState);
-
-  /*----------------------------------------------------------------------
-   *  Step3: Multiply the FFT output with the weights.
-   *----------------------------------------------------------------------*/
-  arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N);
-
-  /* ----------- Post-processing ---------- */
-  /* DCT-IV can be obtained from DCT-II by the equation,
-   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *       Hence, Y4(0) = Y2(0)/2  */
-  /* Getting only real part from the output and Converting to DCT-IV */
-
-  /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
-  i = (S->N - 1U) >> 2U;
-
-  /* pbuff initialized to input buffer. */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
-  in = *pS1++ * (float32_t) 0.5;
-  /* input buffer acts as inplace, so output values are stored in the input itself. */
-  *pbuff++ = in;
-
-  /* pState pointer is incremented twice as the real values are located alternatively in the array */
-  pS1++;
-
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  do
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    /* points to the next real value */
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  i = (S->N - 1U) % 0x4U;
-
-  while (i > 0U)
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-
-    /* points to the next real value */
-    pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  }
-
-
-  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
-
-  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
-  i = S->N >> 2U;
-
-  /* pbuff initialized to the pInlineBuffer(now contains the output values) */
-  pbuff = pInlineBuffer;
-
-  /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
-  do
-  {
-    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
-    in = *pbuff;
-    *pbuff++ = in * S->normalize;
-
-    in = *pbuff;
-    *pbuff++ = in * S->normalize;
-
-    in = *pbuff;
-    *pbuff++ = in * S->normalize;
-
-    in = *pbuff;
-    *pbuff++ = in * S->normalize;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-
-#else
-
-  /* Initializing the loop counter to N/2 */
-  i = S->Nby2;
-
-  do
-  {
-    /* Re-ordering of even and odd elements */
-    /* pState[i] =  pInlineBuffer[2*i] */
-    *pS1++ = *pbuff++;
-    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
-    *pS2-- = *pbuff++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Initializing the loop counter */
-  i = S->N;
-
-  do
-  {
-    /* Writing the re-ordered output back to inplace input buffer */
-    *pbuff++ = *pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-
-  /* ---------------------------------------------------------
-   *     Step2: Calculate RFFT for N-point input
-   * ---------------------------------------------------------- */
-  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
-  arm_rfft_f32 (S->pRfft, pInlineBuffer, pState);
-
-  /*----------------------------------------------------------------------
-   *  Step3: Multiply the FFT output with the weights.
-   *----------------------------------------------------------------------*/
-  arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N);
-
-  /* ----------- Post-processing ---------- */
-  /* DCT-IV can be obtained from DCT-II by the equation,
-   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *       Hence, Y4(0) = Y2(0)/2  */
-  /* Getting only real part from the output and Converting to DCT-IV */
-
-  /* pbuff initialized to input buffer. */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
-  in = *pS1++ * (float32_t) 0.5;
-  /* input buffer acts as inplace, so output values are stored in the input itself. */
-  *pbuff++ = in;
-
-  /* pState pointer is incremented twice as the real values are located alternatively in the array */
-  pS1++;
-
-  /* Initializing the loop counter */
-  i = (S->N - 1U);
-
-  do
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-
-    /* points to the next real value */
-    pS1++;
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
-
-  /* Initializing loop counter */
-  i = S->N;
-
-  /* pbuff initialized to the pInlineBuffer (now contains the output values) */
-  pbuff = pInlineBuffer;
-
-  do
-  {
-    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
-    in = *pbuff;
-    *pbuff++ = in * S->normalize;
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
-}
-
-/**
-  @} end of DCT4_IDCT4 group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dct4_f32.c
+ * Description:  Processing function of DCT4 & IDCT4 F32
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @defgroup DCT4_IDCT4 DCT Type IV Functions
+
+  Representation of signals by minimum number of values is important for storage and transmission.
+  The possibility of large discontinuity between the beginning and end of a period of a signal
+  in DFT can be avoided by extending the signal so that it is even-symmetric.
+  Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the
+  spectrum and is very widely used in signal and image coding applications.
+  The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions.
+  DCT has an excellent energy-packing capability and hence has many applications, in data compression in particular.
+  
+  DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal.
+  Reordering of the input data makes the computation of DCT just a problem of
+  computing the DFT of a real signal with a few additional operations.
+  This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations.
+  
+  DCT type-II can be implemented using the Fast Fourier Transform (FFT) internally; as the transform is applied to real values, a Real FFT can be used.
+  DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing.
+  DCT2 implementation can be described in the following steps:
+  - Re-ordering input
+  - Calculating Real FFT
+  - Multiplication of weights and Real FFT output and getting real part from the product.
+  
+  This process is explained by the block diagram below:
+  \image html DCT4.gif "Discrete Cosine Transform - type-IV"
+ 
+  @par           Algorithm
+                   The N-point type-IV DCT is defined as a real, linear transformation by the formula:
+                   \image html DCT4Equation.gif
+                   where <code>k = 0, 1, 2, ..., N-1</code>
+  @par
+                   Its inverse is defined as follows:
+                   \image html IDCT4Equation.gif
+                   where <code>n = 0, 1, 2, ..., N-1</code>
+  @par
+                   The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N).
+                   The symmetry of the transform matrix indicates that the fast algorithms for the forward
+                   and inverse transform computation are identical.
+                   Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both.
+ 
+  @par           Lengths supported by the transform:
+                   As DCT4 internally uses Real FFT, it supports all the lengths 128, 512, 2048 and 8192.
+                   The library provides separate functions for Q15, Q31, and floating-point data types.
+
+  @par           Instance Structure
+                   The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure.
+                   A separate instance structure must be defined for each transform.
+                   There are separate instance structure declarations for each of the 3 supported data types.
+                 
+  @par           Initialization Functions
+                   There is also an associated initialization function for each data type.
+                   The initialization function performs the following operations:
+                   - Sets the values of the internal structure fields.
+                   - Initializes Real FFT as its process function is used internally in DCT4, by calling \ref arm_rfft_init_f32().
+  @par
+                   Use of the initialization function is optional.
+                   However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+                   To place an instance structure into a const data section, the instance structure must be manually initialized.
+                   Manually initialize the instance structure as follows:
+  <pre>
+      arm_dct4_instance_f32 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
+      arm_dct4_instance_q31 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
+      arm_dct4_instance_q15 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
+  </pre>
+                   where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4;
+                   \c normalize is normalizing factor used and is equal to <code>sqrt(2/N)</code>;
+                   \c pTwiddle points to the twiddle factor table;
+                   \c pCosFactor points to the cosFactor table;
+                   \c pRfft points to the real FFT instance;
+                   \c pCfft points to the complex FFT instance;
+                   The CFFT and RFFT structures also need to be initialized; refer to arm_cfft_radix4_f32()
+                   and arm_rfft_f32() respectively for details regarding static initialization.
+ 
+  @par           Fixed-Point Behavior
+                   Care must be taken when using the fixed-point versions of the DCT4 transform functions.
+                   In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+                   Refer to the function specific documentation below for usage guidelines.
+ */
+
+ /**
+  @addtogroup DCT4_IDCT4
+  @{
+ */
+
+/**
+  @brief         Processing function for the floating-point DCT4/IDCT4.
+  @param[in]     S             points to an instance of the floating-point DCT4/IDCT4 structure
+  @param[in]     pState        points to state buffer
+  @param[in,out] pInlineBuffer points to the in-place input and output buffer
+  @return        none
+ */
+
+void arm_dct4_f32(
+  const arm_dct4_instance_f32 * S,
+        float32_t * pState,
+        float32_t * pInlineBuffer)
+{
+  const float32_t *weights = S->pTwiddle;              /* Pointer to the Weights table */
+  const float32_t *cosFact = S->pCosFactor;            /* Pointer to the cos factors table */
+        float32_t *pS1, *pS2, *pbuff;                  /* Temporary pointers for input buffer and pState buffer */
+        float32_t in;                                  /* Temporary variable */
+        uint32_t i;                                    /* Loop counter */
+
+
+  /* DCT4 computation involves DCT2 (which is calculated using RFFT)
+   * along with some pre-processing and post-processing.
+   * Computational procedure is explained as follows:
+   * (a) Pre-processing involves multiplying input with cos factor,
+   *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*N))
+   *              where,
+   *                 r(n) -- output of preprocessing
+   *                 u(n) -- input to preprocessing(actual Source buffer)
+   * (b) Calculation of DCT2 using FFT is divided into three steps:
+   *                  Step1: Re-ordering of even and odd elements of input.
+   *                  Step2: Calculating FFT of the re-ordered input.
+   *                  Step3: Taking the real part of the product of FFT output and weights.
+   * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
+   *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *                        where,
+   *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
+   * (d) Multiplying the output with the normalizing factor sqrt(2/N).
+   */
+
+  /*-------- Pre-processing ------------*/
+  /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*N)) */
+  arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N);
+  arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N);
+
+  /* ----------------------------------------------------------------
+   * Step1: Re-ordering of even and odd elements as
+   *             pState[i] =  pInlineBuffer[2*i] and
+   *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 - 1
+   ---------------------------------------------------------------------*/
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
+  pS2 = pState + (S->N - 1U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
+  i = S->Nby2 >> 2U;
+
+  /* Re-ordering with loop unrolling: each pass moves 4 even-indexed and 4 odd-indexed samples.
+   ** There is no tail loop for this step, so it relies on Nby2 being a multiple of 4. */
+  do
+  {
+    /* Re-ordering of even and odd elements */
+    /* pState[i] =  pInlineBuffer[2*i] */
+    *pS1++ = *pbuff++;
+    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+  i = S->N >> 2U;
+
+  /* Processing with loop unrolling 4 times as N is always multiple of 4.
+   * Compute 4 outputs at a time */
+  do
+  {
+    /* Writing the re-ordered output back to inplace input buffer */
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+
+  /* ---------------------------------------------------------
+   *     Step2: Calculate RFFT for N-point input
+   * ---------------------------------------------------------- */
+  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+  arm_rfft_f32 (S->pRfft, pInlineBuffer, pState);
+
+  /*----------------------------------------------------------------------
+   *  Step3: Multiply the FFT output with the weights.
+   *----------------------------------------------------------------------*/
+  arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N);
+
+  /* ----------- Post-processing ---------- */
+  /* DCT-IV can be obtained from DCT-II by the equation,
+   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *       Hence, Y4(0) = Y2(0)/2  */
+  /* Getting only real part from the output and Converting to DCT-IV */
+
+  /* Initializing the loop counter to (N - 1) >> 2 for loop unrolling by 4; Y4(0) is computed separately below */
+  i = (S->N - 1U) >> 2U;
+
+  /* pbuff initialized to input buffer. */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+  in = *pS1++ * (float32_t) 0.5;
+  /* input buffer acts as inplace, so output values are stored in the input itself. */
+  *pbuff++ = in;
+
+  /* pState pointer is incremented twice as the real values are located alternatively in the array */
+  pS1++;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  do
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    /* points to the next real value */
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* If (N - 1) is not a multiple of 4, compute the remaining output samples here.
+   ** No loop unrolling is used. */
+  i = (S->N - 1U) % 0x4U;
+
+  while (i > 0U)
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+
+    /* points to the next real value */
+    pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  }
+
+
+  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+  i = S->N >> 2U;
+
+  /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+  pbuff = pInlineBuffer;
+
+  /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
+  do
+  {
+    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+    in = *pbuff;
+    *pbuff++ = in * S->normalize;
+
+    in = *pbuff;
+    *pbuff++ = in * S->normalize;
+
+    in = *pbuff;
+    *pbuff++ = in * S->normalize;
+
+    in = *pbuff;
+    *pbuff++ = in * S->normalize;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+
+#else
+
+  /* Initializing the loop counter to N/2 */
+  i = S->Nby2;
+
+  do
+  {
+    /* Re-ordering of even and odd elements */
+    /* pState[i] =  pInlineBuffer[2*i] */
+    *pS1++ = *pbuff++;
+    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+    *pS2-- = *pbuff++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Initializing the loop counter */
+  i = S->N;
+
+  do
+  {
+    /* Writing the re-ordered output back to inplace input buffer */
+    *pbuff++ = *pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+
+  /* ---------------------------------------------------------
+   *     Step2: Calculate RFFT for N-point input
+   * ---------------------------------------------------------- */
+  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+  arm_rfft_f32 (S->pRfft, pInlineBuffer, pState);
+
+  /*----------------------------------------------------------------------
+   *  Step3: Multiply the FFT output with the weights.
+   *----------------------------------------------------------------------*/
+  arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N);
+
+  /* ----------- Post-processing ---------- */
+  /* DCT-IV can be obtained from DCT-II by the equation,
+   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *       Hence, Y4(0) = Y2(0)/2  */
+  /* Getting only real part from the output and Converting to DCT-IV */
+
+  /* pbuff initialized to input buffer. */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+  in = *pS1++ * (float32_t) 0.5;
+  /* input buffer acts as inplace, so output values are stored in the input itself. */
+  *pbuff++ = in;
+
+  /* pState pointer is incremented twice as the real values are located alternatively in the array */
+  pS1++;
+
+  /* Initializing the loop counter */
+  i = (S->N - 1U);
+
+  do
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+
+    /* points to the next real value */
+    pS1++;
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+  /* Initializing loop counter */
+  i = S->N;
+
+  /* pbuff initialized to the pInlineBuffer (now contains the output values) */
+  pbuff = pInlineBuffer;
+
+  do
+  {
+    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+    in = *pbuff;
+    *pbuff++ = in * S->normalize;
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+}
+
+/**
+  @} end of DCT4_IDCT4 group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
index e1d80c0..7522454 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
@@ -1,130 +1,131 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_dct4_init_f32.c
- * Description:  Initialization function of DCT-4 & IDCT4 F32
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
- /**
-  @addtogroup DCT4_IDCT4
-  @{
- */
-
-/**
-  @brief         Initialization function for the floating-point DCT4/IDCT4.
-  @param[in,out] S          points to an instance of floating-point DCT4/IDCT4 structure
-  @param[in]     S_RFFT     points to an instance of floating-point RFFT/RIFFT structure
-  @param[in]     S_CFFT     points to an instance of floating-point CFFT/CIFFT structure
-  @param[in]     N			length of the DCT4
-  @param[in]     Nby2       half of the length of the DCT4
-  @param[in]     normalize  normalizing factor.
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length
-
-  @par           Normalizing factor
-                   The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>.
-                   Floating-point normalizing factors are mentioned in the table below for different DCT sizes:
-
-                   \image html dct4NormalizingF32Table.gif
- */
-
-arm_status arm_dct4_init_f32(
-  arm_dct4_instance_f32 * S,
-  arm_rfft_instance_f32 * S_RFFT,
-  arm_cfft_radix4_instance_f32 * S_CFFT,
-  uint16_t N,
-  uint16_t Nby2,
-  float32_t normalize)
-{
-  /* Initialize the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
-
-
-  /* Initialize the DCT4 length */
-  S->N = N;
-
-  /* Initialize the half of DCT4 length */
-  S->Nby2 = Nby2;
-
-  /* Initialize the DCT4 Normalizing factor */
-  S->normalize = normalize;
-
-  /* Initialize Real FFT Instance */
-  S->pRfft = S_RFFT;
-
-  /* Initialize Complex FFT Instance */
-  S->pCfft = S_CFFT;
-
-  switch (N)
-  {
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_8192)
-    /* Initialize the table modifier values */
-  case 8192U:
-    S->pTwiddle = Weights_8192;
-    S->pCosFactor = cos_factors_8192;
-    break;
-  #endif
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_2048)
-  case 2048U:
-    S->pTwiddle = Weights_2048;
-    S->pCosFactor = cos_factors_2048;
-    break;
-  #endif
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_512)
-  case 512U:
-    S->pTwiddle = Weights_512;
-    S->pCosFactor = cos_factors_512;
-    break;
-  #endif
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_128)
-  case 128U:
-    S->pTwiddle = Weights_128;
-    S->pCosFactor = cos_factors_128;
-    break;
-  #endif
-  default:
-    status = ARM_MATH_ARGUMENT_ERROR;
-  }
-
-  /* Initialize the RFFT/RIFFT Function */
-  arm_rfft_init_f32(S->pRfft, S->pCfft, S->N, 0U, 1U);
-
-  /* return the status of DCT4 Init function */
-  return (status);
-}
-
-/**
-  @} end of DCT4_IDCT4 group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dct4_init_f32.c
+ * Description:  Initialization function of DCT-4 & IDCT4 F32
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup DCT4_IDCT4
+ */
+
+
+/**
+  @addtogroup DCT4_IDCT4
+  @{
+ */
+
+/**
+  @brief         Initialization function for the floating-point DCT4/IDCT4: fills in the instance and selects the lookup tables for N.
+  @param[in,out] S          points to an instance of floating-point DCT4/IDCT4 structure; its fields are written by this call
+  @param[in]     S_RFFT     points to an instance of floating-point RFFT/RIFFT structure; stored in S and initialized via arm_rfft_init_f32()
+  @param[in]     S_CFFT     points to an instance of floating-point CFFT/CIFFT structure; stored in S and passed on to arm_rfft_init_f32()
+  @param[in]     N          length of the DCT4 (128, 512, 2048 or 8192, subject to the table configuration macros)
+  @param[in]     Nby2       half of the length of the DCT4 (stored as given; not checked against N)
+  @param[in]     normalize  normalizing factor (stored as given; not derived from N here).
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length (or its tables are compiled out)
+
+  @par           Normalizing factor
+                   The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>.
+                   Floating-point normalizing factors are mentioned in the table below for different DCT sizes:
+
+                   \image html dct4NormalizingF32Table.gif
+ */
+
+arm_status arm_dct4_init_f32(
+  arm_dct4_instance_f32 * S,
+  arm_rfft_instance_f32 * S_RFFT,
+  arm_cfft_radix4_instance_f32 * S_CFFT,
+  uint16_t N,
+  uint16_t Nby2,
+  float32_t normalize)
+{
+  /* Status returned to the caller; remains SUCCESS unless the switch below rejects N */
+  arm_status status = ARM_MATH_SUCCESS;
+
+
+  /* Store the DCT4 length */
+  S->N = N;
+
+  /* Store half of the DCT4 length exactly as supplied by the caller */
+  S->Nby2 = Nby2;
+
+  /* Store the DCT4 normalizing factor exactly as supplied by the caller */
+  S->normalize = normalize;
+
+  /* Remember the Real FFT instance; it is initialized at the end of this function */
+  S->pRfft = S_RFFT;
+
+  /* Remember the Complex FFT instance; it is handed to arm_rfft_init_f32() below */
+  S->pCfft = S_CFFT;
+
+  switch (N)
+  {
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_8192)
+    /* Each case selects the twiddle (weights) and cosine-factor tables for that length; a case exists only if its tables are built */
+  case 8192U:
+    S->pTwiddle = Weights_8192;
+    S->pCosFactor = cos_factors_8192;
+    break;
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_2048)
+  case 2048U:
+    S->pTwiddle = Weights_2048;
+    S->pCosFactor = cos_factors_2048;
+    break;
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_512)
+  case 512U:
+    S->pTwiddle = Weights_512;
+    S->pCosFactor = cos_factors_512;
+    break;
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_128)
+  case 128U:
+    S->pTwiddle = Weights_128;
+    S->pCosFactor = cos_factors_128;
+    break;
+  #endif
+  default:
+    status = ARM_MATH_ARGUMENT_ERROR;
+  }
+
+  /* Initialize the RFFT instance. This runs even when N was rejected above, and its return value is discarded. NOTE(review): 0U/1U are presumably ifftFlagR (forward) and bitReverseFlag (enabled) - confirm against arm_rfft_init_f32() */
+  arm_rfft_init_f32(S->pRfft, S->pCfft, S->N, 0U, 1U);
+
+  /* Return the status decided by the switch on N */
+  return (status);
+}
+
+/**
+  @} end of DCT4_IDCT4 group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c
index 5390da3..c5d834a 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c
@@ -1,130 +1,130 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_dct4_init_q15.c
- * Description:  Initialization function of DCT-4 & IDCT4 Q15
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
- /**
-  @addtogroup DCT4_IDCT4
-  @{
- */
-
-/**
-  @brief         Initialization function for the Q15 DCT4/IDCT4.
-  @param[in,out] S         points to an instance of Q15 DCT4/IDCT4 structure
-  @param[in]     S_RFFT    points to an instance of Q15 RFFT/RIFFT structure
-  @param[in]     S_CFFT    points to an instance of Q15 CFFT/CIFFT structure
-  @param[in]     N          length of the DCT4
-  @param[in]     Nby2       half of the length of the DCT4
-  @param[in]     normalize  normalizing factor
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length
-
-  @par           Normalizing factor
-                   The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>.
-                   Normalizing factors in 1.15 format are mentioned in the table below for different DCT sizes:
-
-                   \image html dct4NormalizingQ15Table.gif
- */
-
-arm_status arm_dct4_init_q15(
-  arm_dct4_instance_q15 * S,
-  arm_rfft_instance_q15 * S_RFFT,
-  arm_cfft_radix4_instance_q15 * S_CFFT,
-  uint16_t N,
-  uint16_t Nby2,
-  q15_t normalize)
-{
-  /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
-
-  /* Initialize the DCT4 length */
-  S->N = N;
-
-  /* Initialize the half of DCT4 length */
-  S->Nby2 = Nby2;
-
-  /* Initialize the DCT4 Normalizing factor */
-  S->normalize = normalize;
-
-  /* Initialize Real FFT Instance */
-  S->pRfft = S_RFFT;
-
-  /* Initialize Complex FFT Instance */
-  S->pCfft = S_CFFT;
-
-  switch (N)
-  {
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_8192)
-    /* Initialize the table modifier values */
-  case 8192U:
-    S->pTwiddle = WeightsQ15_8192;
-    S->pCosFactor = cos_factorsQ15_8192;
-    break;
-  #endif
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_2048)
-  case 2048U:
-    S->pTwiddle = WeightsQ15_2048;
-    S->pCosFactor = cos_factorsQ15_2048;
-    break;
-  #endif
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_512)
-  case 512U:
-    S->pTwiddle = WeightsQ15_512;
-    S->pCosFactor = cos_factorsQ15_512;
-    break;
-  #endif 
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_128)
-  case 128U:
-    S->pTwiddle = WeightsQ15_128;
-    S->pCosFactor = cos_factorsQ15_128;
-    break;
-  #endif 
-
-  default:
-    status = ARM_MATH_ARGUMENT_ERROR;
-  }
-
-  /* Initialize the RFFT/RIFFT */
-  arm_rfft_init_q15(S->pRfft, S->N, 0U, 1U);
-
-  /* return the status of DCT4 Init function */
-  return (status);
-}
-
-/**
-  @} end of DCT4_IDCT4 group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dct4_init_q15.c
+ * Description:  Initialization function of DCT-4 & IDCT4 Q15
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup DCT4_IDCT4
+ */
+
+/**
+  @addtogroup DCT4_IDCT4
+  @{
+ */
+
+/**
+  @brief         Initialization function for the Q15 DCT4/IDCT4: fills in the instance and selects the lookup tables for N.
+  @param[in,out] S         points to an instance of Q15 DCT4/IDCT4 structure; its fields are written by this call
+  @param[in]     S_RFFT    points to an instance of Q15 RFFT/RIFFT structure; stored in S and initialized via arm_rfft_init_q15()
+  @param[in]     S_CFFT    points to an instance of Q15 CFFT/CIFFT structure; only stored in S by this function
+  @param[in]     N          length of the DCT4 (128, 512, 2048 or 8192, subject to the table configuration macros)
+  @param[in]     Nby2       half of the length of the DCT4 (stored as given; not checked against N)
+  @param[in]     normalize  normalizing factor (stored as given; not derived from N here)
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length (or its tables are compiled out)
+
+  @par           Normalizing factor
+                   The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>.
+                   Normalizing factors in 1.15 format are mentioned in the table below for different DCT sizes:
+
+                   \image html dct4NormalizingQ15Table.gif
+ */
+
+arm_status arm_dct4_init_q15(
+  arm_dct4_instance_q15 * S,
+  arm_rfft_instance_q15 * S_RFFT,
+  arm_cfft_radix4_instance_q15 * S_CFFT,
+  uint16_t N,
+  uint16_t Nby2,
+  q15_t normalize)
+{
+  /* Status returned to the caller; remains SUCCESS unless the switch below rejects N */
+  arm_status status = ARM_MATH_SUCCESS;
+
+  /* Store the DCT4 length */
+  S->N = N;
+
+  /* Store half of the DCT4 length exactly as supplied by the caller */
+  S->Nby2 = Nby2;
+
+  /* Store the DCT4 normalizing factor exactly as supplied by the caller */
+  S->normalize = normalize;
+
+  /* Remember the Real FFT instance; it is initialized at the end of this function */
+  S->pRfft = S_RFFT;
+
+  /* Remember the Complex FFT instance; it is not used again in this function */
+  S->pCfft = S_CFFT;
+
+  switch (N)
+  {
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_8192)
+    /* Each case selects the twiddle (weights) and cosine-factor tables for that length; a case exists only if its tables are built */
+  case 8192U:
+    S->pTwiddle = WeightsQ15_8192;
+    S->pCosFactor = cos_factorsQ15_8192;
+    break;
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_2048)
+  case 2048U:
+    S->pTwiddle = WeightsQ15_2048;
+    S->pCosFactor = cos_factorsQ15_2048;
+    break;
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_512)
+  case 512U:
+    S->pTwiddle = WeightsQ15_512;
+    S->pCosFactor = cos_factorsQ15_512;
+    break;
+  #endif 
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_128)
+  case 128U:
+    S->pTwiddle = WeightsQ15_128;
+    S->pCosFactor = cos_factorsQ15_128;
+    break;
+  #endif 
+
+  default:
+    status = ARM_MATH_ARGUMENT_ERROR;
+  }
+
+  /* Initialize the RFFT instance. This runs even when N was rejected above, and its return value is discarded. NOTE(review): 0U/1U are presumably ifftFlagR (forward) and bitReverseFlag (enabled) - confirm against arm_rfft_init_q15() */
+  arm_rfft_init_q15(S->pRfft, S->N, 0U, 1U);
+
+  /* Return the status decided by the switch on N */
+  return (status);
+}
+
+/**
+  @} end of DCT4_IDCT4 group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c
index 4c7622a..c0294d7 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c
@@ -1,129 +1,130 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_dct4_init_q31.c
- * Description:  Initialization function of DCT-4 & IDCT4 Q31
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
- /**
-  @addtogroup DCT4_IDCT4
-  @{
- */
-
-/**
-  @brief  Initialization function for the Q31 DCT4/IDCT4.
-  @param[in,out] S          points to an instance of Q31 DCT4/IDCT4 structure.
-  @param[in]     S_RFFT     points to an instance of Q31 RFFT/RIFFT structure
-  @param[in]     S_CFFT     points to an instance of Q31 CFFT/CIFFT structure
-  @param[in]     N          length of the DCT4.
-  @param[in]     Nby2       half of the length of the DCT4.
-  @param[in]     normalize  normalizing factor.
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length
-
-  @par           Normalizing factor:
-                   The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>.
-                   Normalizing factors in 1.31 format are mentioned in the table below for different DCT sizes:
-
-                   \image html dct4NormalizingQ31Table.gif
- */
-
-arm_status arm_dct4_init_q31(
-  arm_dct4_instance_q31 * S,
-  arm_rfft_instance_q31 * S_RFFT,
-  arm_cfft_radix4_instance_q31 * S_CFFT,
-  uint16_t N,
-  uint16_t Nby2,
-  q31_t normalize)
-{
-  /* Initialize the default arm status */
-  arm_status status = ARM_MATH_SUCCESS;
-
-  /* Initialize the DCT4 length */
-  S->N = N;
-
-  /* Initialize the half of DCT4 length */
-  S->Nby2 = Nby2;
-
-  /* Initialize the DCT4 Normalizing factor */
-  S->normalize = normalize;
-
-  /* Initialize Real FFT Instance */
-  S->pRfft = S_RFFT;
-
-  /* Initialize Complex FFT Instance */
-  S->pCfft = S_CFFT;
-
-  switch (N)
-  {
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_8192)
-    /* Initialize the table modifier values */
-  case 8192U:
-    S->pTwiddle = WeightsQ31_8192;
-    S->pCosFactor = cos_factorsQ31_8192;
-    break;
-  #endif
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_2048)
-  case 2048U:
-    S->pTwiddle = WeightsQ31_2048;
-    S->pCosFactor = cos_factorsQ31_2048;
-    break;
-  #endif 
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_512)
-  case 512U:
-    S->pTwiddle = WeightsQ31_512;
-    S->pCosFactor = cos_factorsQ31_512;
-    break;
-  #endif 
-
-  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_128)
-  case 128U:
-    S->pTwiddle = WeightsQ31_128;
-    S->pCosFactor = cos_factorsQ31_128;
-    break;
-  #endif
-  default:
-    status = ARM_MATH_ARGUMENT_ERROR;
-  }
-
-  /* Initialize the RFFT/RIFFT Function */
-  arm_rfft_init_q31(S->pRfft,  S->N, 0U, 1U);
-
-  /* return the status of DCT4 Init function */
-  return (status);
-}
-
-/**
-  @} end of DCT4_IDCT4 group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dct4_init_q31.c
+ * Description:  Initialization function of DCT-4 & IDCT4 Q31
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup DCT4_IDCT4
+ */
+
+
+/**
+  @addtogroup DCT4_IDCT4
+  @{
+ */
+
+/**
+  @brief  Initialization function for the Q31 DCT4/IDCT4: fills in the instance and selects the lookup tables for N.
+  @param[in,out] S          points to an instance of Q31 DCT4/IDCT4 structure; its fields are written by this call.
+  @param[in]     S_RFFT     points to an instance of Q31 RFFT/RIFFT structure; stored in S and initialized via arm_rfft_init_q31()
+  @param[in]     S_CFFT     points to an instance of Q31 CFFT/CIFFT structure; only stored in S by this function
+  @param[in]     N          length of the DCT4 (128, 512, 2048 or 8192, subject to the table configuration macros).
+  @param[in]     Nby2       half of the length of the DCT4 (stored as given; not checked against N).
+  @param[in]     normalize  normalizing factor (stored as given; not derived from N here).
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>N</code> is not a supported transform length (or its tables are compiled out)
+
+  @par           Normalizing factor:
+                   The normalizing factor is <code>sqrt(2/N)</code>, which depends on the size of transform <code>N</code>.
+                   Normalizing factors in 1.31 format are mentioned in the table below for different DCT sizes:
+
+                   \image html dct4NormalizingQ31Table.gif
+ */
+
+arm_status arm_dct4_init_q31(
+  arm_dct4_instance_q31 * S,
+  arm_rfft_instance_q31 * S_RFFT,
+  arm_cfft_radix4_instance_q31 * S_CFFT,
+  uint16_t N,
+  uint16_t Nby2,
+  q31_t normalize)
+{
+  /* Status returned to the caller; remains SUCCESS unless the switch below rejects N */
+  arm_status status = ARM_MATH_SUCCESS;
+
+  /* Store the DCT4 length */
+  S->N = N;
+
+  /* Store half of the DCT4 length exactly as supplied by the caller */
+  S->Nby2 = Nby2;
+
+  /* Store the DCT4 normalizing factor exactly as supplied by the caller */
+  S->normalize = normalize;
+
+  /* Remember the Real FFT instance; it is initialized at the end of this function */
+  S->pRfft = S_RFFT;
+
+  /* Remember the Complex FFT instance; it is not used again in this function */
+  S->pCfft = S_CFFT;
+
+  switch (N)
+  {
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_8192)
+    /* Each case selects the twiddle (weights) and cosine-factor tables for that length; a case exists only if its tables are built */
+  case 8192U:
+    S->pTwiddle = WeightsQ31_8192;
+    S->pCosFactor = cos_factorsQ31_8192;
+    break;
+  #endif
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_2048)
+  case 2048U:
+    S->pTwiddle = WeightsQ31_2048;
+    S->pCosFactor = cos_factorsQ31_2048;
+    break;
+  #endif 
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_512)
+  case 512U:
+    S->pTwiddle = WeightsQ31_512;
+    S->pCosFactor = cos_factorsQ31_512;
+    break;
+  #endif 
+
+  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_128)
+  case 128U:
+    S->pTwiddle = WeightsQ31_128;
+    S->pCosFactor = cos_factorsQ31_128;
+    break;
+  #endif
+  default:
+    status = ARM_MATH_ARGUMENT_ERROR;
+  }
+
+  /* Initialize the RFFT instance. This runs even when N was rejected above, and its return value is discarded. NOTE(review): 0U/1U are presumably ifftFlagR (forward) and bitReverseFlag (enabled) - confirm against arm_rfft_init_q31() */
+  arm_rfft_init_q31(S->pRfft,  S->N, 0U, 1U);
+
+  /* Return the status decided by the switch on N */
+  return (status);
+}
+
+/**
+  @} end of DCT4_IDCT4 group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c
index a4650da..ba26300 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c
@@ -1,381 +1,381 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_dct4_q15.c
- * Description:  Processing function of DCT4 & IDCT4 Q15
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-/**
-  @addtogroup DCT4_IDCT4
-  @{
- */
-
-/**
-  @brief         Processing function for the Q15 DCT4/IDCT4.
-  @param[in]     S             points to an instance of the Q15 DCT4 structure.
-  @param[in]     pState        points to state buffer.
-  @param[in,out] pInlineBuffer points to the in-place input and output buffer.
-  @return        none
- 
-  @par           Input an output formats
-                   Internally inputs are downscaled in the RFFT process function to avoid overflows.
-                   Number of bits downscaled, depends on the size of the transform. The input and output
-                   formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
-
-                   \image html dct4FormatsQ15Table.gif
- */
-
-void arm_dct4_q15(
-  const arm_dct4_instance_q15 * S,
-        q15_t * pState,
-        q15_t * pInlineBuffer)
-{
-  const q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
-  const q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
-        q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
-        q15_t in;                                      /* Temporary variable */
-        uint32_t i;                                    /* Loop counter */
-
-
-  /* DCT4 computation involves DCT2 (which is calculated using RFFT)
-   * along with some pre-processing and post-processing.
-   * Computational procedure is explained as follows:
-   * (a) Pre-processing involves multiplying input with cos factor,
-   *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
-   *              where,
-   *                 r(n) -- output of preprocessing
-   *                 u(n) -- input to preprocessing(actual Source buffer)
-   * (b) Calculation of DCT2 using FFT is divided into three steps:
-   *                  Step1: Re-ordering of even and odd elements of input.
-   *                  Step2: Calculating FFT of the re-ordered input.
-   *                  Step3: Taking the real part of the product of FFT output and weights.
-   * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
-   *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *                        where,
-   *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
-   * (d) Multiplying the output with the normalizing factor sqrt(2/N).
-   */
-
-  /*-------- Pre-processing ------------*/
-  /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
-  arm_mult_q15 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
-  arm_shift_q15 (pInlineBuffer, 1, pInlineBuffer, S->N);
-
-  /* ----------------------------------------------------------------
-   * Step1: Re-ordering of even and odd elements as
-   *             pState[i] =  pInlineBuffer[2*i] and
-   *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
-   ---------------------------------------------------------------------*/
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
-  pS2 = pState + (S->N - 1U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-
-#if defined (ARM_MATH_LOOPUNROLL)
-
-  /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
-  i = S->Nby2 >> 2U;
-
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  do
-  {
-    /* Re-ordering of even and odd elements */
-    /* pState[i] =  pInlineBuffer[2*i] */
-    *pS1++ = *pbuff++;
-    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
-  i = S->N >> 2U;
-
-  /* Processing with loop unrolling 4 times as N is always multiple of 4.
-   * Compute 4 outputs at a time */
-  do
-  {
-    /* Writing the re-ordered output back to inplace input buffer */
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-
-  /* ---------------------------------------------------------
-   *     Step2: Calculate RFFT for N-point input
-   * ---------------------------------------------------------- */
-  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
-  arm_rfft_q15 (S->pRfft, pInlineBuffer, pState);
-
-  /*----------------------------------------------------------------------
-   *  Step3: Multiply the FFT output with the weights.
-   *----------------------------------------------------------------------*/
-  arm_cmplx_mult_cmplx_q15 (pState, weights, pState, S->N);
-
-  /* The output of complex multiplication is in 3.13 format.
-   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
-  arm_shift_q15 (pState, 2, pState, S->N * 2);
-
-  /* ----------- Post-processing ---------- */
-  /* DCT-IV can be obtained from DCT-II by the equation,
-   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *       Hence, Y4(0) = Y2(0)/2  */
-  /* Getting only real part from the output and Converting to DCT-IV */
-
-  /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
-  i = (S->N - 1U) >> 2U;
-
-  /* pbuff initialized to input buffer. */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
-  in = *pS1++ >> 1U;
-  /* input buffer acts as inplace, so output values are stored in the input itself. */
-  *pbuff++ = in;
-
-  /* pState pointer is incremented twice as the real values are located alternatively in the array */
-  pS1++;
-
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  do
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    /* points to the next real value */
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  i = (S->N - 1U) % 0x4U;
-
-  while (i > 0U)
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-
-    /* points to the next real value */
-    pS1++;
-
-    /* Decrement loop counter */
-    i--;
-  }
-
-
-  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
-
-  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
-  i = S->N >> 2U;
-
-  /* pbuff initialized to the pInlineBuffer(now contains the output values) */
-  pbuff = pInlineBuffer;
-
-  /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
-  do
-  {
-    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
-    in = *pbuff;
-    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
-
-    in = *pbuff;
-    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
-
-    in = *pbuff;
-    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
-
-    in = *pbuff;
-    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-
-#else
-
-  /* Initializing the loop counter to N/2 */
-  i = S->Nby2;
-
-  do
-  {
-    /* Re-ordering of even and odd elements */
-    /* pState[i] =  pInlineBuffer[2*i] */
-    *pS1++ = *pbuff++;
-    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
-    *pS2-- = *pbuff++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Initializing the loop counter */
-  i = S->N;
-
-  do
-  {
-    /* Writing the re-ordered output back to inplace input buffer */
-    *pbuff++ = *pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-
-  /* ---------------------------------------------------------
-   *     Step2: Calculate RFFT for N-point input
-   * ---------------------------------------------------------- */
-  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
-  arm_rfft_q15 (S->pRfft, pInlineBuffer, pState);
-
-  /*----------------------------------------------------------------------
-   *  Step3: Multiply the FFT output with the weights.
-   *----------------------------------------------------------------------*/
-  arm_cmplx_mult_cmplx_q15 (pState, weights, pState, S->N);
-
-  /* The output of complex multiplication is in 3.13 format.
-   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
-  arm_shift_q15 (pState, 2, pState, S->N * 2);
-
-  /* ----------- Post-processing ---------- */
-  /* DCT-IV can be obtained from DCT-II by the equation,
-   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *       Hence, Y4(0) = Y2(0)/2  */
-  /* Getting only real part from the output and Converting to DCT-IV */
-
-  /* pbuff initialized to input buffer. */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
-  in = *pS1++ >> 1U;
-  /* input buffer acts as inplace, so output values are stored in the input itself. */
-  *pbuff++ = in;
-
-  /* pState pointer is incremented twice as the real values are located alternatively in the array */
-  pS1++;
-
-  /* Initializing the loop counter */
-  i = (S->N - 1U);
-
-  do
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-
-    /* points to the next real value */
-    pS1++;
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
-
-  /* Initializing loop counter */
-  i = S->N;
-
-  /* pbuff initialized to the pInlineBuffer (now contains the output values) */
-  pbuff = pInlineBuffer;
-
-  do
-  {
-    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
-    in = *pbuff;
-    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
-
-    /* Decrement loop counter */
-    i--;
-
-  } while (i > 0U);
-
-#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
-}
-
-/**
-  @} end of DCT4_IDCT4 group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dct4_q15.c
+ * Description:  Processing function of DCT4 & IDCT4 Q15
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @addtogroup DCT4_IDCT4
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q15 DCT4/IDCT4.
+  @param[in]     S             points to an instance of the Q15 DCT4 structure.
+  @param[in]     pState        points to state buffer.
+  @param[in,out] pInlineBuffer points to the in-place input and output buffer.
+  @return        none
+ 
+  @par           Input and output formats
+                   Internally inputs are downscaled in the RFFT process function to avoid overflows.
+                   Number of bits downscaled, depends on the size of the transform. The input and output
+                   formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
+
+                   \image html dct4FormatsQ15Table.gif
+ */
+
+void arm_dct4_q15(
+  const arm_dct4_instance_q15 * S,
+        q15_t * pState,
+        q15_t * pInlineBuffer)
+{
+  const q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
+  const q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
+        q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
+        q15_t in;                                      /* Running value: previous Y4 output / sample being scaled */
+        uint32_t i;                                    /* Loop counter */
+
+
+  /* DCT4 computation involves DCT2 (which is calculated using RFFT)
+   * along with some pre-processing and post-processing.
+   * Computational procedure is explained as follows:
+   * (a) Pre-processing involves multiplying input with cos factor,
+   *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*N))
+   *              where,
+   *                 r(n) -- output of preprocessing
+   *                 u(n) -- input to preprocessing(actual Source buffer)
+   * (b) Calculation of DCT2 using FFT is divided into three steps:
+   *                  Step1: Re-ordering of even and odd elements of input.
+   *                  Step2: Calculating FFT of the re-ordered input.
+   *                  Step3: Taking the real part of the product of FFT output and weights.
+   * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
+   *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *                        where,
+   *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
+   * (d) Multiplying the output with the normalizing factor sqrt(2/N).
+   */
+
+  /*-------- Pre-processing ------------*/
+  /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*N)) */
+  arm_mult_q15 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
+  arm_shift_q15 (pInlineBuffer, 1, pInlineBuffer, S->N);
+
+  /* ----------------------------------------------------------------
+   * Step1: Re-ordering of even and odd elements as
+   *             pState[i] =  pInlineBuffer[2*i] and
+   *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 - 1
+   ---------------------------------------------------------------------*/
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* pS2 initialized to pState+N-1, i.e. the last of the first N elements of the state buffer */
+  pS2 = pState + (S->N - 1U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
+  i = S->Nby2 >> 2U;
+
+  /* Loop unrolled by 4: each pass moves 4 even/odd input pairs (8 samples).
+   ** No tail loop follows, so Nby2 is expected to be a non-zero multiple of 4. */
+  do
+  {
+    /* Re-ordering of even and odd elements */
+    /* pState[i] =  pInlineBuffer[2*i] */
+    *pS1++ = *pbuff++;
+    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+  i = S->N >> 2U;
+
+  /* Processing with loop unrolling 4 times as N is always multiple of 4.
+   * Compute 4 outputs at a time */
+  do
+  {
+    /* Writing the re-ordered output back to inplace input buffer */
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+
+  /* ---------------------------------------------------------
+   *     Step2: Calculate RFFT for N-point input
+   * ---------------------------------------------------------- */
+  /* pInlineBuffer is real input of length N, pState is the complex output of length 2N */
+  arm_rfft_q15 (S->pRfft, pInlineBuffer, pState);
+
+  /*----------------------------------------------------------------------
+   *  Step3: Multiply the FFT output with the weights.
+   *----------------------------------------------------------------------*/
+  arm_cmplx_mult_cmplx_q15 (pState, weights, pState, S->N);
+
+  /* The output of complex multiplication is in 3.13 format.
+   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
+  arm_shift_q15 (pState, 2, pState, S->N * 2);
+
+  /* ----------- Post-processing ---------- */
+  /* DCT-IV can be obtained from DCT-II by the equation,
+   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *       Hence, Y4(0) = Y2(0)/2  */
+  /* Getting only real part from the output and Converting to DCT-IV */
+
+  /* Initializing the loop counter to (N - 1) >> 2 for loop unrolling by 4 (Y4(0) is handled separately) */
+  i = (S->N - 1U) >> 2U;
+
+  /* pbuff initialized to input buffer. */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+  in = *pS1++ >> 1U;
+  /* input buffer acts as inplace, so output values are stored in the input itself. */
+  *pbuff++ = in;
+
+  /* pState pointer is incremented twice as the real values are located alternatively in the array */
+  pS1++;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  do
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    /* points to the next real value */
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* If (N - 1) is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  i = (S->N - 1U) % 0x4U;
+
+  while (i > 0U)
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+
+    /* points to the next real value */
+    pS1++;
+
+    /* Decrement loop counter */
+    i--;
+  }
+
+
+  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+  i = S->N >> 2U;
+
+  /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+  pbuff = pInlineBuffer;
+
+  /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
+  do
+  {
+    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+    in = *pbuff;
+    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+    in = *pbuff;
+    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+    in = *pbuff;
+    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+    in = *pbuff;
+    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+
+#else
+
+  /* Initializing the loop counter to N/2 */
+  i = S->Nby2;
+
+  do
+  {
+    /* Re-ordering of even and odd elements */
+    /* pState[i] =  pInlineBuffer[2*i] */
+    *pS1++ = *pbuff++;
+    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+    *pS2-- = *pbuff++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Initializing the loop counter */
+  i = S->N;
+
+  do
+  {
+    /* Writing the re-ordered output back to inplace input buffer */
+    *pbuff++ = *pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+
+  /* ---------------------------------------------------------
+   *     Step2: Calculate RFFT for N-point input
+   * ---------------------------------------------------------- */
+  /* pInlineBuffer is real input of length N, pState is the complex output of length 2N */
+  arm_rfft_q15 (S->pRfft, pInlineBuffer, pState);
+
+  /*----------------------------------------------------------------------
+   *  Step3: Multiply the FFT output with the weights.
+   *----------------------------------------------------------------------*/
+  arm_cmplx_mult_cmplx_q15 (pState, weights, pState, S->N);
+
+  /* The output of complex multiplication is in 3.13 format.
+   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
+  arm_shift_q15 (pState, 2, pState, S->N * 2);
+
+  /* ----------- Post-processing ---------- */
+  /* DCT-IV can be obtained from DCT-II by the equation,
+   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *       Hence, Y4(0) = Y2(0)/2  */
+  /* Getting only real part from the output and Converting to DCT-IV */
+
+  /* pbuff initialized to input buffer. */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+  in = *pS1++ >> 1U;
+  /* input buffer acts as inplace, so output values are stored in the input itself. */
+  *pbuff++ = in;
+
+  /* pState pointer is incremented twice as the real values are located alternatively in the array */
+  pS1++;
+
+  /* Initializing the loop counter to N - 1: Y4(0) is already written, Y4(1)..Y4(N-1) remain */
+  i = (S->N - 1U);
+
+  do
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+
+    /* points to the next real value */
+    pS1++;
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+  /* Initializing loop counter */
+  i = S->N;
+
+  /* pbuff initialized to the pInlineBuffer (now contains the output values) */
+  pbuff = pInlineBuffer;
+
+  do
+  {
+    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+    in = *pbuff;
+    *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+    /* Decrement loop counter */
+    i--;
+
+  } while (i > 0U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+}
+
+/**
+  @} end of DCT4_IDCT4 group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c
index 6cbccff..5757083 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c
@@ -1,383 +1,383 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_dct4_q31.c
- * Description:  Processing function of DCT4 & IDCT4 Q31
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-/**
-  @addtogroup DCT4_IDCT4
-  @{
- */
-
-/**
-  @brief         Processing function for the Q31 DCT4/IDCT4.
-  @param[in]     S             points to an instance of the Q31 DCT4 structure.
-  @param[in]     pState        points to state buffer.
-  @param[in,out] pInlineBuffer points to the in-place input and output buffer.
-  @return        none
-
-  @par           Input an output formats
-                   Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
-                   as the conversion from DCT2 to DCT4 involves one subtraction.
-                   Internally inputs are downscaled in the RFFT process function to avoid overflows.
-                   Number of bits downscaled, depends on the size of the transform.
-                   The input and output formats for different DCT sizes and number of bits to upscale are
-                   mentioned in the table below:
-
-                   \image html dct4FormatsQ31Table.gif
- */
-
-void arm_dct4_q31(
-  const arm_dct4_instance_q31 * S,
-        q31_t * pState,
-        q31_t * pInlineBuffer)
-{
-  const q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
-  const q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
-        q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
-        q31_t in;                                      /* Temporary variable */
-        uint32_t i;                                    /* Loop counter */
-
-
-  /* DCT4 computation involves DCT2 (which is calculated using RFFT)
-   * along with some pre-processing and post-processing.
-   * Computational procedure is explained as follows:
-   * (a) Pre-processing involves multiplying input with cos factor,
-   *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
-   *              where,
-   *                 r(n) -- output of preprocessing
-   *                 u(n) -- input to preprocessing(actual Source buffer)
-   * (b) Calculation of DCT2 using FFT is divided into three steps:
-   *                  Step1: Re-ordering of even and odd elements of input.
-   *                  Step2: Calculating FFT of the re-ordered input.
-   *                  Step3: Taking the real part of the product of FFT output and weights.
-   * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
-   *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *                        where,
-   *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
-   * (d) Multiplying the output with the normalizing factor sqrt(2/N).
-   */
-
-  /*-------- Pre-processing ------------*/
-  /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
-  arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
-  arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N);
-
-  /* ----------------------------------------------------------------
-   * Step1: Re-ordering of even and odd elements as
-   *             pState[i] =  pInlineBuffer[2*i] and
-   *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
-   ---------------------------------------------------------------------*/
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
-  pS2 = pState + (S->N - 1U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-
-#if defined (ARM_MATH_LOOPUNROLL)
-
-  /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
-  i = S->Nby2 >> 2U;
-
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  do
-  {
-    /* Re-ordering of even and odd elements */
-    /* pState[i] =  pInlineBuffer[2*i] */
-    *pS1++ = *pbuff++;
-    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    *pS1++ = *pbuff++;
-    *pS2-- = *pbuff++;
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
-  i = S->N >> 2U;
-
-  /* Processing with loop unrolling 4 times as N is always multiple of 4.
-   * Compute 4 outputs at a time */
-  do
-  {
-    /* Writing the re-ordered output back to inplace input buffer */
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-    *pbuff++ = *pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-
-  /* ---------------------------------------------------------
-   *     Step2: Calculate RFFT for N-point input
-   * ---------------------------------------------------------- */
-  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
-  arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
-
-  /*----------------------------------------------------------------------
-   *  Step3: Multiply the FFT output with the weights.
-   *----------------------------------------------------------------------*/
-  arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
-
-  /* The output of complex multiplication is in 3.29 format.
-   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
-  arm_shift_q31 (pState, 2, pState, S->N * 2);
-
-  /* ----------- Post-processing ---------- */
-  /* DCT-IV can be obtained from DCT-II by the equation,
-   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *       Hence, Y4(0) = Y2(0)/2  */
-  /* Getting only real part from the output and Converting to DCT-IV */
-
-  /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
-  i = (S->N - 1U) >> 2U;
-
-  /* pbuff initialized to input buffer. */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
-  in = *pS1++ >> 1U;
-  /* input buffer acts as inplace, so output values are stored in the input itself. */
-  *pbuff++ = in;
-
-  /* pState pointer is incremented twice as the real values are located alternatively in the array */
-  pS1++;
-
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-  do
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    /* points to the next real value */
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    in = *pS1++ - in;
-    *pbuff++ = in;
-    pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-   ** No loop unrolling is used. */
-  i = (S->N - 1U) % 0x4U;
-
-  while (i > 0U)
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-
-    /* points to the next real value */
-    pS1++;
-
-    /* Decrement loop counter */
-    i--;
-  }
-
-
-  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
-
-  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
-  i = S->N >> 2U;
-
-  /* pbuff initialized to the pInlineBuffer(now contains the output values) */
-  pbuff = pInlineBuffer;
-
-  /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
-  do
-  {
-    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
-    in = *pbuff;
-    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
-
-    in = *pbuff;
-    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
-
-    in = *pbuff;
-    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
-
-    in = *pbuff;
-    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-
-#else
-
-  /* Initializing the loop counter to N/2 */
-  i = S->Nby2;
-
-  do
-  {
-    /* Re-ordering of even and odd elements */
-    /* pState[i] =  pInlineBuffer[2*i] */
-    *pS1++ = *pbuff++;
-    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
-    *pS2-- = *pbuff++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-  /* pbuff initialized to input buffer */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Initializing the loop counter */
-  i = S->N;
-
-  do
-  {
-    /* Writing the re-ordered output back to inplace input buffer */
-    *pbuff++ = *pS1++;
-
-    /* Decrement the loop counter */
-    i--;
-  } while (i > 0U);
-
-
-  /* ---------------------------------------------------------
-   *     Step2: Calculate RFFT for N-point input
-   * ---------------------------------------------------------- */
-  /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
-  arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
-
-  /*----------------------------------------------------------------------
-   *  Step3: Multiply the FFT output with the weights.
-   *----------------------------------------------------------------------*/
-  arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
-
-  /* The output of complex multiplication is in 3.29 format.
-   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
-  arm_shift_q31(pState, 2, pState, S->N * 2);
-
-  /* ----------- Post-processing ---------- */
-  /* DCT-IV can be obtained from DCT-II by the equation,
-   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
-   *       Hence, Y4(0) = Y2(0)/2  */
-  /* Getting only real part from the output and Converting to DCT-IV */
-
-  /* pbuff initialized to input buffer. */
-  pbuff = pInlineBuffer;
-
-  /* pS1 initialized to pState */
-  pS1 = pState;
-
-  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
-  in = *pS1++ >> 1U;
-  /* input buffer acts as inplace, so output values are stored in the input itself. */
-  *pbuff++ = in;
-
-  /* pState pointer is incremented twice as the real values are located alternatively in the array */
-  pS1++;
-
-  /* Initializing the loop counter */
-  i = (S->N - 1U);
-
-  while (i > 0U)
-  {
-    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
-    /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
-    in = *pS1++ - in;
-    *pbuff++ = in;
-
-    /* points to the next real value */
-    pS1++;
-
-    /* Decrement loop counter */
-    i--;
-  }
-
-  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
-
-  /* Initializing loop counter */
-  i = S->N;
-
-  /* pbuff initialized to the pInlineBuffer (now contains the output values) */
-  pbuff = pInlineBuffer;
-
-  do
-  {
-    /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
-    in = *pbuff;
-    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
-
-    /* Decrement loop counter */
-    i--;
-  } while (i > 0U);
-
-#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
-}
-
-/**
-  @} end of DCT4_IDCT4 group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_dct4_q31.c
+ * Description:  Processing function of DCT4 & IDCT4 Q31
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @addtogroup DCT4_IDCT4
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q31 DCT4/IDCT4.
+  @param[in]     S             points to an instance of the Q31 DCT4 structure.
+  @param[in]     pState        points to state buffer (used as scratch; 2*N elements are overwritten).
+  @param[in,out] pInlineBuffer points to the in-place input and output buffer (N samples).
+  @return        none
+
+  @par           Input and output formats
+                   Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
+                   as the conversion from DCT2 to DCT4 involves one subtraction.
+                   Internally inputs are downscaled in the RFFT process function to avoid overflows.
+                   Number of bits downscaled, depends on the size of the transform.
+                   The input and output formats for different DCT sizes and number of bits to upscale are
+                   mentioned in the table below:
+
+                   \image html dct4FormatsQ31Table.gif
+ */
+
+void arm_dct4_q31(
+  const arm_dct4_instance_q31 * S,
+        q31_t * pState,
+        q31_t * pInlineBuffer)
+{
+  const q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
+  const q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
+        q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
+        q31_t in;                                      /* Running value: previous Y4 output, or sample being scaled */
+        uint32_t i;                                    /* Loop counter */
+
+
+  /* DCT4 computation involves DCT2 (which is calculated using RFFT)
+   * along with some pre-processing and post-processing.
+   * Computational procedure is explained as follows:
+   * (a) Pre-processing involves multiplying input with cos factor,
+   *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*N))
+   *              where (NOTE(review): denominator previously read 4*n; 4*N assumed -- confirm),
+   *                 r(n) -- output of preprocessing
+   *                 u(n) -- input to preprocessing(actual Source buffer)
+   * (b) Calculation of DCT2 using FFT is divided into three steps:
+   *                  Step1: Re-ordering of even and odd elements of input.
+   *                  Step2: Calculating FFT of the re-ordered input.
+   *                  Step3: Taking the real part of the product of FFT output and weights.
+   * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
+   *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *                        where,
+   *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
+   * (d) Multiplying the output with the normalizing factor sqrt(2/N).
+   */
+
+  /*-------- Pre-processing ------------*/
+  /* Multiply the input by the cos factor table in place, then shift left by 1 bit for the factor of 2 in r(n) */
+  arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
+  arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N);
+
+  /* ----------------------------------------------------------------
+   * Step1: Re-ordering of even and odd elements as
+   *             pState[i] =  pInlineBuffer[2*i] and
+   *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 - 1
+   ---------------------------------------------------------------------*/
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* pS2 initialized to pState+N-1, i.e. the last of the first N state elements; it is filled backwards */
+  pS2 = pState + (S->N - 1U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
+  i = S->Nby2 >> 2U;
+
+  /* Re-ordering with the loop unrolled by 4: each iteration consumes 8 input samples.
+   ** There is no tail loop here, and the do-while body always runs at least once. */
+  do
+  {
+    /* Re-ordering of even and odd elements */
+    /* pState[i] =  pInlineBuffer[2*i] */
+    *pS1++ = *pbuff++;
+    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    *pS1++ = *pbuff++;
+    *pS2-- = *pbuff++;
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+  i = S->N >> 2U;
+
+  /* Processing with loop unrolling 4 times as N is always a multiple of 4.
+   * Copy 4 re-ordered samples at a time */
+  do
+  {
+    /* Writing the re-ordered output back to inplace input buffer */
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+    *pbuff++ = *pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+
+  /* ---------------------------------------------------------
+   *     Step2: Calculate RFFT for N-point input
+   * ---------------------------------------------------------- */
+  /* pInlineBuffer is real input of length N, pState is the complex output of length 2N */
+  arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
+
+  /*----------------------------------------------------------------------
+   *  Step3: Multiply the FFT output with the weights.
+   *----------------------------------------------------------------------*/
+  arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
+
+  /* The output of complex multiplication is in 3.29 format.
+   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
+  arm_shift_q31 (pState, 2, pState, S->N * 2);
+
+  /* ----------- Post-processing ---------- */
+  /* DCT-IV can be obtained from DCT-II by the equation,
+   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *       Hence, Y4(0) = Y2(0)/2  */
+  /* Getting only real part from the output and Converting to DCT-IV */
+
+  /* Initializing the loop counter to (N - 1) >> 2 for loop unrolling by 4; Y4(0) is handled separately below */
+  i = (S->N - 1U) >> 2U;
+
+  /* pbuff initialized to input buffer. */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+  in = *pS1++ >> 1U;
+  /* input buffer acts as inplace, so output values are stored in the input itself. */
+  *pbuff++ = in;
+
+  /* pS1 is advanced once more to skip the next element: real values sit at every other index of pState */
+  pS1++;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 0 to 3 samples. */
+  do
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* 'in' holds Y4(k-1) on entry and Y4(k) afterwards; pS1 then steps past the element just read */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    /* points to the next real value */
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    in = *pS1++ - in;
+    *pbuff++ = in;
+    pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* If (N - 1) is not a multiple of 4, compute the remaining (N - 1) % 4 output samples here.
+   ** No loop unrolling is used. */
+  i = (S->N - 1U) % 0x4U;
+
+  while (i > 0U)
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* 'in' holds Y4(k-1) on entry and Y4(k) afterwards; pS1 then steps past the element just read */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+
+    /* points to the next real value */
+    pS1++;
+
+    /* Decrement loop counter */
+    i--;
+  }
+
+
+  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+  /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+  i = S->N >> 2U;
+
+  /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+  pbuff = pInlineBuffer;
+
+  /* Processing with loop unrolling 4 times as N is always a multiple of 4.  Compute 4 outputs at a time */
+  do
+  {
+    /* Scale by S->normalize (the sqrt(2/N) factor of step (d)): 64-bit product, then shift right by 31 */
+    in = *pbuff;
+    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+    in = *pbuff;
+    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+    in = *pbuff;
+    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+    in = *pbuff;
+    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+
+#else
+
+  /* Initializing the loop counter to N/2 */
+  i = S->Nby2;
+
+  do
+  {
+    /* Re-ordering of even and odd elements */
+    /* pState[i] =  pInlineBuffer[2*i] */
+    *pS1++ = *pbuff++;
+    /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+    *pS2-- = *pbuff++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+  /* pbuff initialized to input buffer */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Initializing the loop counter */
+  i = S->N;
+
+  do
+  {
+    /* Writing the re-ordered output back to inplace input buffer */
+    *pbuff++ = *pS1++;
+
+    /* Decrement the loop counter */
+    i--;
+  } while (i > 0U);
+
+
+  /* ---------------------------------------------------------
+   *     Step2: Calculate RFFT for N-point input
+   * ---------------------------------------------------------- */
+  /* pInlineBuffer is real input of length N, pState is the complex output of length 2N */
+  arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
+
+  /*----------------------------------------------------------------------
+   *  Step3: Multiply the FFT output with the weights.
+   *----------------------------------------------------------------------*/
+  arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
+
+  /* The output of complex multiplication is in 3.29 format.
+   * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
+  arm_shift_q31(pState, 2, pState, S->N * 2);
+
+  /* ----------- Post-processing ---------- */
+  /* DCT-IV can be obtained from DCT-II by the equation,
+   *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+   *       Hence, Y4(0) = Y2(0)/2  */
+  /* Getting only real part from the output and Converting to DCT-IV */
+
+  /* pbuff initialized to input buffer. */
+  pbuff = pInlineBuffer;
+
+  /* pS1 initialized to pState */
+  pS1 = pState;
+
+  /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+  in = *pS1++ >> 1U;
+  /* input buffer acts as inplace, so output values are stored in the input itself. */
+  *pbuff++ = in;
+
+  /* pS1 is advanced once more to skip the next element: real values sit at every other index of pState */
+  pS1++;
+
+  /* Initializing the loop counter to N - 1: Y4(0) has already been produced above */
+  i = (S->N - 1U);
+
+  while (i > 0U)
+  {
+    /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+    /* 'in' holds Y4(k-1) on entry and Y4(k) afterwards; pS1 then steps past the element just read */
+    in = *pS1++ - in;
+    *pbuff++ = in;
+
+    /* points to the next real value */
+    pS1++;
+
+    /* Decrement loop counter */
+    i--;
+  }
+
+  /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+  /* Initializing loop counter */
+  i = S->N;
+
+  /* pbuff initialized to the pInlineBuffer (now contains the output values) */
+  pbuff = pInlineBuffer;
+
+  do
+  {
+    /* Scale by S->normalize (the sqrt(2/N) factor of step (d)): 64-bit product, then shift right by 31 */
+    in = *pbuff;
+    *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+    /* Decrement loop counter */
+    i--;
+  } while (i > 0U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+}
+
+/**
+  @} end of DCT4_IDCT4 group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c
index 8844b73..2ad336d 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c
@@ -1,318 +1,309 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_rfft_f32.c
- * Description:  RFFT & RIFFT Floating point process function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-/* ----------------------------------------------------------------------
- * Internal functions prototypes
- * -------------------------------------------------------------------- */
-
-extern void arm_radix4_butterfly_f32(
-        float32_t * pSrc,
-        uint16_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier);
-
-extern void arm_radix4_butterfly_inverse_f32(
-        float32_t * pSrc,
-        uint16_t fftLen,
-  const float32_t * pCoef,
-        uint16_t twidCoefModifier,
-        float32_t onebyfftLen);
-
-extern void arm_bitreversal_f32(
-        float32_t * pSrc,
-        uint16_t fftSize,
-        uint16_t bitRevFactor,
-  const uint16_t * pBitRevTab);
-
-void arm_split_rfft_f32(
-        float32_t * pSrc,
-        uint32_t fftLen,
-  const float32_t * pATable,
-  const float32_t * pBTable,
-        float32_t * pDst,
-        uint32_t modifier);
-
-void arm_split_rifft_f32(
-        float32_t * pSrc,
-        uint32_t fftLen,
-  const float32_t * pATable,
-  const float32_t * pBTable,
-        float32_t * pDst,
-        uint32_t modifier);
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup RealFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the floating-point RFFT/RIFFT.
-                 Source buffer is modified by this function.
-                 
-  @deprecated    Do not use this function.  It has been superceded by \ref arm_rfft_fast_f32 and will be removed in the future.
-  @param[in]     S    points to an instance of the floating-point RFFT/RIFFT structure
-  @param[in]     pSrc points to the input buffer
-  @param[out]    pDst points to the output buffer
-  @return        none
-
-  @par
-                   For the RIFFT, the source buffer must at least have length 
-                   fftLenReal + 2.
-                   The last two elements must be equal to what would be generated
-                   by the RFFT:
-                     (pSrc[0] - pSrc[1]) and 0.0f
- */
-
-void arm_rfft_f32(
-  const arm_rfft_instance_f32 * S,
-        float32_t * pSrc,
-        float32_t * pDst)
-{
-  const arm_cfft_radix4_instance_f32 *S_CFFT = S->pCfft;
-
-  /* Calculation of Real IFFT of input */
-  if (S->ifftFlagR == 1U)
-  {
-     /*  Real IFFT core process */
-     arm_split_rifft_f32 (pSrc, S->fftLenBy2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
-
-
-     /* Complex radix-4 IFFT process */
-     arm_radix4_butterfly_inverse_f32 (pDst, S_CFFT->fftLen, S_CFFT->pTwiddle, S_CFFT->twidCoefModifier, S_CFFT->onebyfftLen);
-
-    /* Bit reversal process */
-    if (S->bitReverseFlagR == 1U)
-    {
-      arm_bitreversal_f32 (pDst, S_CFFT->fftLen, S_CFFT->bitRevFactor, S_CFFT->pBitRevTable);
-    }
-  }
-  else
-  {
-    /* Calculation of RFFT of input */
-
-    /* Complex radix-4 FFT process */
-    arm_radix4_butterfly_f32 (pSrc, S_CFFT->fftLen, S_CFFT->pTwiddle, S_CFFT->twidCoefModifier);
-
-    /* Bit reversal process */
-    if (S->bitReverseFlagR == 1U)
-    {
-      arm_bitreversal_f32 (pSrc, S_CFFT->fftLen, S_CFFT->bitRevFactor, S_CFFT->pBitRevTable);
-    }
-
-    /*  Real FFT core process */
-    arm_split_rfft_f32 (pSrc, S->fftLenBy2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
-  }
-
-}
-
-/**
-  @} end of RealFFT group
- */
-
-/**
-  @brief         Core Real FFT process
-  @param[in]     pSrc      points to input buffer
-  @param[in]     fftLen    length of FFT
-  @param[in]     pATable   points to twiddle Coef A buffer
-  @param[in]     pBTable   points to twiddle Coef B buffer
-  @param[out]    pDst      points to output buffer
-  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  @return        none
- */
-
-void arm_split_rfft_f32(
-        float32_t * pSrc,
-        uint32_t fftLen,
-  const float32_t * pATable,
-  const float32_t * pBTable,
-        float32_t * pDst,
-        uint32_t modifier)
-{
-        uint32_t i;                                    /* Loop Counter */
-        float32_t outR, outI;                          /* Temporary variables for output */
-  const float32_t *pCoefA, *pCoefB;                    /* Temporary pointers for twiddle factors */
-        float32_t CoefA1, CoefA2, CoefB1;              /* Temporary variables for twiddle coefficients */
-        float32_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U];      /* temp pointers for output buffer */
-        float32_t *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U];      /* temp pointers for input buffer */
-
-  /* Init coefficient pointers */
-  pCoefA = &pATable[modifier * 2];
-  pCoefB = &pBTable[modifier * 2];
-
-  i = fftLen - 1U;
-
-  while (i > 0U)
-  {
-     /*
-       outR = (  pSrc[2 * i]             * pATable[2 * i]
-               - pSrc[2 * i + 1]         * pATable[2 * i + 1]
-               + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
-               + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-
-       outI = (  pIn[2 * i + 1]         * pATable[2 * i]
-               + pIn[2 * i]             * pATable[2 * i + 1]
-               + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
-               - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
-      */
-
-    /* read pATable[2 * i] */
-    CoefA1 = *pCoefA++;
-    /* pATable[2 * i + 1] */
-    CoefA2 = *pCoefA;
-
-    /* pSrc[2 * i] * pATable[2 * i] */
-    outR = *pSrc1 * CoefA1;
-    /* pSrc[2 * i] * CoefA2 */
-    outI = *pSrc1++ * CoefA2;
-
-    /* (pSrc[2 * i + 1] + pSrc[2 * fftLen - 2 * i + 1]) * CoefA2 */
-    outR -= (*pSrc1 + *pSrc2) * CoefA2;
-    /* pSrc[2 * i + 1] * CoefA1 */
-    outI += *pSrc1++ * CoefA1;
-
-    CoefB1 = *pCoefB;
-
-    /* pSrc[2 * fftLen - 2 * i + 1] * CoefB1 */
-    outI -= *pSrc2-- * CoefB1;
-    /* pSrc[2 * fftLen - 2 * i] * CoefA2 */
-    outI -= *pSrc2 * CoefA2;
-
-    /* pSrc[2 * fftLen - 2 * i] * CoefB1 */
-    outR += *pSrc2-- * CoefB1;
-
-    /* write output */
-    *pDst1++ = outR;
-    *pDst1++ = outI;
-
-    /* write complex conjugate output */
-    *pDst2-- = -outI;
-    *pDst2-- = outR;
-
-    /* update coefficient pointer */
-    pCoefB = pCoefB + (modifier * 2U);
-    pCoefA = pCoefA + ((modifier * 2U) - 1U);
-
-    i--;
-
-  }
-
-  pDst[2U * fftLen] = pSrc[0] - pSrc[1];
-  pDst[(2U * fftLen) + 1U] = 0.0f;
-
-  pDst[0] = pSrc[0] + pSrc[1];
-  pDst[1] = 0.0f;
-
-}
-
-
-/**
-  @brief         Core Real IFFT process
-  @param[in]     pSrc      points to input buffer
-  @param[in]     fftLen    length of FFT
-  @param[in]     pATable   points to twiddle Coef A buffer
-  @param[in]     pBTable   points to twiddle Coef B buffer
-  @param[out]    pDst      points to output buffer
-  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  @return        none
- */
-
-void arm_split_rifft_f32(
-        float32_t * pSrc,
-        uint32_t fftLen,
-  const float32_t * pATable,
-  const float32_t * pBTable,
-        float32_t * pDst,
-        uint32_t modifier)
-{
-        float32_t outR, outI;                          /* Temporary variables for output */
-  const float32_t *pCoefA, *pCoefB;                    /* Temporary pointers for twiddle factors */
-        float32_t CoefA1, CoefA2, CoefB1;              /* Temporary variables for twiddle coefficients */
-        float32_t *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U];
-
-  pCoefA = &pATable[0];
-  pCoefB = &pBTable[0];
-
-  while (fftLen > 0U)
-  {
-     /*
-       outR = (  pIn[2 * i]             * pATable[2 * i]
-               + pIn[2 * i + 1]         * pATable[2 * i + 1]
-               + pIn[2 * n - 2 * i]     * pBTable[2 * i]
-               - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-
-       outI = (  pIn[2 * i + 1]         * pATable[2 * i]
-               - pIn[2 * i]             * pATable[2 * i + 1]
-               - pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
-               - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
-      */
-
-     CoefA1 = *pCoefA++;
-     CoefA2 = *pCoefA;
-
-     /* outR = (pSrc[2 * i] * CoefA1 */
-     outR = *pSrc1 * CoefA1;
-
-     /* - pSrc[2 * i] * CoefA2 */
-     outI = -(*pSrc1++) * CoefA2;
-
-     /* (pSrc[2 * i + 1] + pSrc[2 * fftLen - 2 * i + 1]) * CoefA2 */
-     outR += (*pSrc1 + *pSrc2) * CoefA2;
-
-     /* pSrc[2 * i + 1] * CoefA1 */
-     outI += (*pSrc1++) * CoefA1;
-
-     CoefB1 = *pCoefB;
-
-     /* - pSrc[2 * fftLen - 2 * i + 1] * CoefB1 */
-     outI -= *pSrc2-- * CoefB1;
-
-     /* pSrc[2 * fftLen - 2 * i] * CoefB1 */
-     outR += *pSrc2 * CoefB1;
-
-     /* pSrc[2 * fftLen - 2 * i] * CoefA2 */
-     outI += *pSrc2-- * CoefA2;
-
-     /* write output */
-     *pDst++ = outR;
-     *pDst++ = outI;
-
-     /* update coefficient pointer */
-     pCoefB = pCoefB + (modifier * 2);
-     pCoefA = pCoefA + (modifier * 2 - 1);
-
-     /* Decrement loop count */
-     fftLen--;
-  }
-
-}
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_f32.c
+ * Description:  RFFT & RIFFT Floating point process function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/* ----------------------------------------------------------------------
+ * Internal functions prototypes
+ * -------------------------------------------------------------------- */
+
+extern void arm_radix4_butterfly_f32(
+        float32_t * pSrc,
+        uint16_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier);
+
+extern void arm_radix4_butterfly_inverse_f32(
+        float32_t * pSrc,
+        uint16_t fftLen,
+  const float32_t * pCoef,
+        uint16_t twidCoefModifier,
+        float32_t onebyfftLen);
+
+extern void arm_bitreversal_f32(
+        float32_t * pSrc,
+        uint16_t fftSize,
+        uint16_t bitRevFactor,
+  const uint16_t * pBitRevTab);
+
+void arm_split_rfft_f32(
+        float32_t * pSrc,
+        uint32_t fftLen,
+  const float32_t * pATable,
+  const float32_t * pBTable,
+        float32_t * pDst,
+        uint32_t modifier);
+
+void arm_split_rifft_f32(
+        float32_t * pSrc,
+        uint32_t fftLen,
+  const float32_t * pATable,
+  const float32_t * pBTable,
+        float32_t * pDst,
+        uint32_t modifier);
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup RealFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the floating-point RFFT/RIFFT (S->ifftFlagR == 1 selects the inverse transform, any other value the forward one).
+  @deprecated    Do not use this function.  It has been superseded by \ref arm_rfft_fast_f32 and will be removed in the future.
+  @param[in]     S    points to an instance of the floating-point RFFT/RIFFT structure
+  @param[in]     pSrc points to the input buffer. The forward path transforms it in place, so its contents are overwritten; the inverse split stage reads elements 0 .. 2 * S->fftLenBy2 + 1
+  @param[out]    pDst points to the output buffer. The forward split stage writes 4 * S->fftLenBy2 floats, the inverse split stage 2 * S->fftLenBy2 floats
+  @return        none
+ */
+
+void arm_rfft_f32(
+  const arm_rfft_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst)
+{
+  const arm_cfft_radix4_instance_f32 *S_CFFT = S->pCfft;
+
+  /* Inverse transform (RIFFT): split stage first, then the complex inverse FFT on pDst */
+  if (S->ifftFlagR == 1U)
+  {
+     /* Real IFFT core process: builds the complex IFFT input in pDst from the data in pSrc */
+     arm_split_rifft_f32 (pSrc, S->fftLenBy2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+
+
+     /* Complex radix-4 IFFT process, operating on pDst only */
+     arm_radix4_butterfly_inverse_f32 (pDst, S_CFFT->fftLen, S_CFFT->pTwiddle, S_CFFT->twidCoefModifier, S_CFFT->onebyfftLen);
+
+    /* Bit reversal of pDst, performed only when S->bitReverseFlagR is 1 */
+    if (S->bitReverseFlagR == 1U)
+    {
+      arm_bitreversal_f32 (pDst, S_CFFT->fftLen, S_CFFT->bitRevFactor, S_CFFT->pBitRevTable);
+    }
+  }
+  else
+  {
+    /* Forward transform (RFFT): complex FFT on pSrc first, then the split stage into pDst */
+
+    /* Complex radix-4 FFT process, operating on pSrc only (the caller's input is overwritten) */
+    arm_radix4_butterfly_f32 (pSrc, S_CFFT->fftLen, S_CFFT->pTwiddle, S_CFFT->twidCoefModifier);
+
+    /* Bit reversal of pSrc, performed only when S->bitReverseFlagR is 1 */
+    if (S->bitReverseFlagR == 1U)
+    {
+      arm_bitreversal_f32 (pSrc, S_CFFT->fftLen, S_CFFT->bitRevFactor, S_CFFT->pBitRevTable);
+    }
+
+    /* Real FFT core process: turns the complex FFT result in pSrc into the output spectrum in pDst */
+    arm_split_rfft_f32 (pSrc, S->fftLenBy2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+  }
+
+}
+
+/**
+  @} end of RealFFT group
+ */
+
+/**
+  @brief         Core Real FFT process: combines each complex bin of pSrc with its mirrored bin and writes the result plus its complex conjugate to pDst
+  @param[in]     pSrc      points to input buffer (interleaved real/imaginary values; elements 0 .. 2 * fftLen - 1 are read)
+  @param[in]     fftLen    length of FFT in complex points; must be at least 1 because the loop counter is the unsigned value fftLen - 1
+  @param[in]     pATable   points to twiddle Coef A buffer
+  @param[in]     pBTable   points to twiddle Coef B buffer
+  @param[out]    pDst      points to output buffer (elements 0 .. 4 * fftLen - 1 are written)
+  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+ */
+
+void arm_split_rfft_f32(
+        float32_t * pSrc,
+        uint32_t fftLen,
+  const float32_t * pATable,
+  const float32_t * pBTable,
+        float32_t * pDst,
+        uint32_t modifier)
+{
+        uint32_t i;                                    /* Loop Counter */
+        float32_t outR, outI;                          /* Real and imaginary part of the current output bin */
+  const float32_t *pCoefA, *pCoefB;                    /* Read pointers into pATable and pBTable */
+        float32_t CoefA1, CoefA2, CoefB1;              /* pATable[2 * i], pATable[2 * i + 1] and pBTable[2 * i] of the current bin */
+        float32_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U];      /* pDst1 walks forward from bin 1, pDst2 walks backward from the last element */
+        float32_t *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U];      /* pSrc1 walks forward from bin 1, pSrc2 walks backward from the last element */
+
+  /* Init coefficient pointers to the entries for bin 1; bin 0 is handled after the loop */
+  pCoefA = &pATable[modifier * 2];
+  pCoefB = &pBTable[modifier * 2];
+
+  i = fftLen - 1U;
+
+  while (i > 0U)
+  {
+     /*
+       outR = (  pSrc[2 * i]             * pATable[2 * i]
+               - pSrc[2 * i + 1]         * pATable[2 * i + 1]
+               + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
+               + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+       outI = (  pSrc[2 * i + 1]         * pATable[2 * i]
+               + pSrc[2 * i]             * pATable[2 * i + 1]
+               + pSrc[2 * n - 2 * i]     * pBTable[2 * i + 1]
+               - pSrc[2 * n - 2 * i + 1] * pBTable[2 * i]);
+       (n is fftLen; the code below substitutes -CoefA2 for pBTable[2 * i + 1]) */
+
+    /* read pATable[2 * i] */
+    CoefA1 = *pCoefA++;
+    /* pATable[2 * i + 1] (pointer not advanced here; see the coefficient update below) */
+    CoefA2 = *pCoefA;
+
+    /* pSrc[2 * i] * pATable[2 * i] */
+    outR = *pSrc1 * CoefA1;
+    /* pSrc[2 * i] * CoefA2 */
+    outI = *pSrc1++ * CoefA2;
+
+    /* (pSrc[2 * i + 1] + pSrc[2 * fftLen - 2 * i + 1]) * CoefA2 */
+    outR -= (*pSrc1 + *pSrc2) * CoefA2;
+    /* pSrc[2 * i + 1] * CoefA1 */
+    outI += *pSrc1++ * CoefA1;
+
+    CoefB1 = *pCoefB;
+
+    /* pSrc[2 * fftLen - 2 * i + 1] * CoefB1 */
+    outI -= *pSrc2-- * CoefB1;
+    /* pSrc[2 * fftLen - 2 * i] * CoefA2 */
+    outI -= *pSrc2 * CoefA2;
+
+    /* pSrc[2 * fftLen - 2 * i] * CoefB1 */
+    outR += *pSrc2-- * CoefB1;
+
+    /* write output bin i (real, then imaginary) */
+    *pDst1++ = outR;
+    *pDst1++ = outI;
+
+    /* write complex conjugate output at the mirrored position, filling pDst from the end (imaginary first) */
+    *pDst2-- = -outI;
+    *pDst2-- = outR;
+
+    /* update coefficient pointers: both advance by modifier * 2 per bin (pCoefA was already incremented once above) */
+    pCoefB = pCoefB + (modifier * 2U);
+    pCoefA = pCoefA + ((modifier * 2U) - 1U);
+
+    i--;
+
+  }
+  /* Bins fftLen and 0 are computed directly from pSrc[0] and pSrc[1]; their imaginary parts are set to zero */
+  pDst[2U * fftLen] = pSrc[0] - pSrc[1];
+  pDst[(2U * fftLen) + 1U] = 0.0f;
+
+  pDst[0] = pSrc[0] + pSrc[1];
+  pDst[1] = 0.0f;
+
+}
+
+
+/**
+  @brief         Core Real IFFT process: combines each complex bin of pSrc with its mirrored bin to build fftLen complex values in pDst
+  @param[in]     pSrc      points to input buffer (interleaved real/imaginary values; elements 0 .. 2 * fftLen + 1 are read, so it must hold at least 2 * fftLen + 2 floats)
+  @param[in]     fftLen    length of FFT in complex points; also used as the loop counter (one output bin per iteration)
+  @param[in]     pATable   points to twiddle Coef A buffer
+  @param[in]     pBTable   points to twiddle Coef B buffer
+  @param[out]    pDst      points to output buffer (2 * fftLen floats are written)
+  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+ */
+
+void arm_split_rifft_f32(
+        float32_t * pSrc,
+        uint32_t fftLen,
+  const float32_t * pATable,
+  const float32_t * pBTable,
+        float32_t * pDst,
+        uint32_t modifier)
+{
+        float32_t outR, outI;                          /* Real and imaginary part of the current output bin */
+  const float32_t *pCoefA, *pCoefB;                    /* Read pointers into pATable and pBTable */
+        float32_t CoefA1, CoefA2, CoefB1;              /* pATable[2 * i], pATable[2 * i + 1] and pBTable[2 * i] of the current bin */
+        float32_t *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U];
+  /* pSrc1 walks forward from bin 0, pSrc2 walks backward from element 2 * fftLen + 1; coefficients start at bin 0 */
+  pCoefA = &pATable[0];
+  pCoefB = &pBTable[0];
+
+  while (fftLen > 0U)
+  {
+     /*
+       outR = (  pSrc[2 * i]             * pATable[2 * i]
+               + pSrc[2 * i + 1]         * pATable[2 * i + 1]
+               + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
+               - pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+       outI = (  pSrc[2 * i + 1]         * pATable[2 * i]
+               - pSrc[2 * i]             * pATable[2 * i + 1]
+               - pSrc[2 * n - 2 * i]     * pBTable[2 * i + 1]
+               - pSrc[2 * n - 2 * i + 1] * pBTable[2 * i]);
+       (n is fftLen; the code below substitutes -CoefA2 for pBTable[2 * i + 1]) */
+
+     CoefA1 = *pCoefA++;
+     CoefA2 = *pCoefA;
+
+     /* outR = pSrc[2 * i] * CoefA1 */
+     outR = *pSrc1 * CoefA1;
+
+     /* outI = -pSrc[2 * i] * CoefA2 */
+     outI = -(*pSrc1++) * CoefA2;
+
+     /* (pSrc[2 * i + 1] + pSrc[2 * fftLen - 2 * i + 1]) * CoefA2 */
+     outR += (*pSrc1 + *pSrc2) * CoefA2;
+
+     /* pSrc[2 * i + 1] * CoefA1 */
+     outI += (*pSrc1++) * CoefA1;
+
+     CoefB1 = *pCoefB;
+
+     /* - pSrc[2 * fftLen - 2 * i + 1] * CoefB1 */
+     outI -= *pSrc2-- * CoefB1;
+
+     /* pSrc[2 * fftLen - 2 * i] * CoefB1 */
+     outR += *pSrc2 * CoefB1;
+
+     /* pSrc[2 * fftLen - 2 * i] * CoefA2 */
+     outI += *pSrc2-- * CoefA2;
+
+     /* write output bin i (real, then imaginary) */
+     *pDst++ = outR;
+     *pDst++ = outI;
+
+     /* update coefficient pointers: both advance by modifier * 2 per bin (pCoefA was already incremented once above) */
+     pCoefB = pCoefB + (modifier * 2);
+     pCoefA = pCoefA + (modifier * 2 - 1);
+
+     /* Decrement loop count */
+     fftLen--;
+  }
+
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c
index 6712504..ebaa7d9 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c
@@ -1,603 +1,320 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_rfft_fast_f32.c
- * Description:  RFFT & RIFFT Floating point process function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-void stage_rfft_f32(
-  const arm_rfft_fast_instance_f32 * S,
-        float32_t * p,
-        float32_t * pOut)
-{
-        int32_t  k;                                /* Loop Counter */
-        float32_t twR, twI;                         /* RFFT Twiddle coefficients */
-  const float32_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
-        float32_t *pA = p;                          /* increasing pointer */
-        float32_t *pB = p;                          /* decreasing pointer */
-        float32_t xAR, xAI, xBR, xBI;               /* temporary variables */
-        float32_t t1a, t1b;                         /* temporary variables */
-        float32_t p0, p1, p2, p3;                   /* temporary variables */
-
-        float32x4x2_t tw,xA,xB;
-        float32x4x2_t tmp1, tmp2, res;
-
-        uint32x4_t     vecStridesFwd, vecStridesBkwd;
-
-        vecStridesFwd = vidupq_u32((uint32_t)0, 2);
-        vecStridesBkwd = -vecStridesFwd;
-
-        int blockCnt;
-
-
-   k = (S->Sint).fftLen - 1;
-
-   /* Pack first and last sample of the frequency domain together */
-
-   xBR = pB[0];
-   xBI = pB[1];
-   xAR = pA[0];
-   xAI = pA[1];
-
-   twR = *pCoeff++ ;
-   twI = *pCoeff++ ;
-
-   // U1 = XA(1) + XB(1); % It is real
-   t1a = xBR + xAR  ;
-
-   // U2 = XB(1) - XA(1); % It is imaginary
-   t1b = xBI + xAI  ;
-
-   // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
-   // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
-   *pOut++ = 0.5f * ( t1a + t1b );
-   *pOut++ = 0.5f * ( t1a - t1b );
-
-   // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
-   pB  = p + 2*k;
-   pA += 2;
-
-   blockCnt = k >> 2;
-   while (blockCnt > 0)
-   {
-      /*
-         function X = my_split_rfft(X, ifftFlag)
-         % X is a series of real numbers
-         L  = length(X);
-         XC = X(1:2:end) +i*X(2:2:end);
-         XA = fft(XC);
-         XB = conj(XA([1 end:-1:2]));
-         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
-         for l = 2:L/2
-            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
-         end
-         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
-         X = XA;
-      */
-
-
-      xA = vld2q_f32(pA);
-      pA += 8;
-
-      xB = vld2q_f32(pB);
-
-      xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd);
-      xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd);
-
-      xB.val[1] = vnegq_f32(xB.val[1]);
-      pB -= 8;
-
-
-      tw = vld2q_f32(pCoeff);
-      pCoeff += 8;
-
-
-      tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]);
-      tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]);
-
-      tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]);
-      tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]);
-
-      res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
-      res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
-
-      res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
-      res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
-
-      res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] );
-      res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] );
-
-      res.val[0] = vmulq_n_f32(res.val[0], 0.5f);
-      res.val[1] = vmulq_n_f32(res.val[1], 0.5f);
-
-
-      vst2q_f32(pOut, res);
-      pOut += 8;
-
-    
-      blockCnt--;
-   } 
-
-   blockCnt = k & 3;
-   while (blockCnt > 0)
-   {
-      /*
-         function X = my_split_rfft(X, ifftFlag)
-         % X is a series of real numbers
-         L  = length(X);
-         XC = X(1:2:end) +i*X(2:2:end);
-         XA = fft(XC);
-         XB = conj(XA([1 end:-1:2]));
-         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
-         for l = 2:L/2
-            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
-         end
-         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
-         X = XA;
-      */
-
-      xBI = pB[1];
-      xBR = pB[0];
-      xAR = pA[0];
-      xAI = pA[1];
-
-      twR = *pCoeff++;
-      twI = *pCoeff++;
-
-      t1a = xBR - xAR ;
-      t1b = xBI + xAI ;
-
-      // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
-      // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
-      p0 = twR * t1a;
-      p1 = twI * t1a;
-      p2 = twR * t1b;
-      p3 = twI * t1b;
-
-      *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
-      *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
-
-      pA += 2;
-      pB -= 2;
-      blockCnt--;
-   }
-}
-
-/* Prepares data for inverse cfft */
-void merge_rfft_f32(
-  const arm_rfft_fast_instance_f32 * S,
-        float32_t * p,
-        float32_t * pOut)
-{
-        int32_t  k;                                /* Loop Counter */
-        float32_t twR, twI;                         /* RFFT Twiddle coefficients */
-  const float32_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
-        float32_t *pA = p;                          /* increasing pointer */
-        float32_t *pB = p;                          /* decreasing pointer */
-        float32_t xAR, xAI, xBR, xBI;               /* temporary variables */
-        float32_t t1a, t1b, r, s, t, u;             /* temporary variables */
-
-        float32x4x2_t tw,xA,xB;
-        float32x4x2_t tmp1, tmp2, res;
-        uint32x4_t     vecStridesFwd, vecStridesBkwd;
-
-        vecStridesFwd = vidupq_u32((uint32_t)0, 2);
-        vecStridesBkwd = -vecStridesFwd;
-
-        int blockCnt;
-        
-
-   k = (S->Sint).fftLen - 1;
-
-   xAR = pA[0];
-   xAI = pA[1];
-
-   pCoeff += 2 ;
-
-   *pOut++ = 0.5f * ( xAR + xAI );
-   *pOut++ = 0.5f * ( xAR - xAI );
-
-   pB  =  p + 2*k ;
-   pA +=  2    ;
-
-   blockCnt = k >> 2;
-   while (blockCnt > 0)
-   {
-      /* G is half of the frequency complex spectrum */
-      //for k = 2:N
-      //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
-      xA = vld2q_f32(pA);
-      pA += 8;
-
-      xB = vld2q_f32(pB);
-
-      xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd);
-      xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd);
-
-      xB.val[1] = vnegq_f32(xB.val[1]);
-      pB -= 8;
-
-
-      tw = vld2q_f32(pCoeff);
-      tw.val[1] = vnegq_f32(tw.val[1]);
-      pCoeff += 8;
-
-
-      tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]);
-      tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]);
-
-      tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]);
-      tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]);
-
-      res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
-      res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
-
-      res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
-      res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
-
-      res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] );
-      res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] );
-
-      res.val[0] = vmulq_n_f32(res.val[0], 0.5f);
-      res.val[1] = vmulq_n_f32(res.val[1], 0.5f);
-
-
-      vst2q_f32(pOut, res);
-      pOut += 8;
-
-    
-      blockCnt--;
-   }
-
-   blockCnt = k & 3;
-   while (blockCnt > 0)
-   {
-      /* G is half of the frequency complex spectrum */
-      //for k = 2:N
-      //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
-      xBI =   pB[1]    ;
-      xBR =   pB[0]    ;
-      xAR =  pA[0];
-      xAI =  pA[1];
-
-      twR = *pCoeff++;
-      twI = *pCoeff++;
-
-      t1a = xAR - xBR ;
-      t1b = xAI + xBI ;
-
-      r = twR * t1a;
-      s = twI * t1b;
-      t = twI * t1a;
-      u = twR * t1b;
-
-      // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
-      // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
-      *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
-      *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
-
-      pA += 2;
-      pB -= 2;
-      blockCnt--;
-   }
-
-}
-#else
-void stage_rfft_f32(
-  const arm_rfft_fast_instance_f32 * S,
-        float32_t * p,
-        float32_t * pOut)
-{
-        int32_t  k;                                /* Loop Counter */
-        float32_t twR, twI;                         /* RFFT Twiddle coefficients */
-  const float32_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
-        float32_t *pA = p;                          /* increasing pointer */
-        float32_t *pB = p;                          /* decreasing pointer */
-        float32_t xAR, xAI, xBR, xBI;               /* temporary variables */
-        float32_t t1a, t1b;                         /* temporary variables */
-        float32_t p0, p1, p2, p3;                   /* temporary variables */
-
-
-   k = (S->Sint).fftLen - 1;
-
-   /* Pack first and last sample of the frequency domain together */
-
-   xBR = pB[0];
-   xBI = pB[1];
-   xAR = pA[0];
-   xAI = pA[1];
-
-   twR = *pCoeff++ ;
-   twI = *pCoeff++ ;
-
-
-   // U1 = XA(1) + XB(1); % It is real
-   t1a = xBR + xAR  ;
-
-   // U2 = XB(1) - XA(1); % It is imaginary
-   t1b = xBI + xAI  ;
-
-   // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
-   // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
-   *pOut++ = 0.5f * ( t1a + t1b );
-   *pOut++ = 0.5f * ( t1a - t1b );
-
-   // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
-   pB  = p + 2*k;
-   pA += 2;
-
-   do
-   {
-      /*
-         function X = my_split_rfft(X, ifftFlag)
-         % X is a series of real numbers
-         L  = length(X);
-         XC = X(1:2:end) +i*X(2:2:end);
-         XA = fft(XC);
-         XB = conj(XA([1 end:-1:2]));
-         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
-         for l = 2:L/2
-            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
-         end
-         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
-         X = XA;
-      */
-
-      xBI = pB[1];
-      xBR = pB[0];
-      xAR = pA[0];
-      xAI = pA[1];
-
-      twR = *pCoeff++;
-      twI = *pCoeff++;
-
-      t1a = xBR - xAR ;
-      t1b = xBI + xAI ;
-
-      // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
-      // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
-      p0 = twR * t1a;
-      p1 = twI * t1a;
-      p2 = twR * t1b;
-      p3 = twI * t1b;
-
-      *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
-      *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
-
-
-      pA += 2;
-      pB -= 2;
-      k--;
-   } while (k > 0);
-}
-
-/* Prepares data for inverse cfft */
-void merge_rfft_f32(
-  const arm_rfft_fast_instance_f32 * S,
-        float32_t * p,
-        float32_t * pOut)
-{
-        int32_t  k;                                /* Loop Counter */
-        float32_t twR, twI;                         /* RFFT Twiddle coefficients */
-  const float32_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
-        float32_t *pA = p;                          /* increasing pointer */
-        float32_t *pB = p;                          /* decreasing pointer */
-        float32_t xAR, xAI, xBR, xBI;               /* temporary variables */
-        float32_t t1a, t1b, r, s, t, u;             /* temporary variables */
-
-   k = (S->Sint).fftLen - 1;
-
-   xAR = pA[0];
-   xAI = pA[1];
-
-   pCoeff += 2 ;
-
-   *pOut++ = 0.5f * ( xAR + xAI );
-   *pOut++ = 0.5f * ( xAR - xAI );
-
-   pB  =  p + 2*k ;
-   pA +=  2	   ;
-
-   while (k > 0)
-   {
-      /* G is half of the frequency complex spectrum */
-      //for k = 2:N
-      //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
-      xBI =   pB[1]    ;
-      xBR =   pB[0]    ;
-      xAR =  pA[0];
-      xAI =  pA[1];
-
-      twR = *pCoeff++;
-      twI = *pCoeff++;
-
-      t1a = xAR - xBR ;
-      t1b = xAI + xBI ;
-
-      r = twR * t1a;
-      s = twI * t1b;
-      t = twI * t1a;
-      u = twR * t1b;
-
-      // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
-      // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
-      *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
-      *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
-
-      pA += 2;
-      pB -= 2;
-      k--;
-   }
-
-}
-
-#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
-
-/**
-  @ingroup groupTransforms
-*/
-
-/**
-  @defgroup RealFFT Real FFT Functions
- 
-  @par
-                   The CMSIS DSP library includes specialized algorithms for computing the
-                   FFT of real data sequences.  The FFT is defined over complex data but
-                   in many applications the input is real.  Real FFT algorithms take advantage
-                   of the symmetry properties of the FFT and have a speed advantage over complex
-                   algorithms of the same length.
-  @par
-                   The Fast RFFT algorithm relays on the mixed radix CFFT that save processor usage.
-  @par
-                   The real length N forward FFT of a sequence is computed using the steps shown below.
-  @par
-                   \image html RFFT.gif "Real Fast Fourier Transform"
-  @par
-                   The real sequence is initially treated as if it were complex to perform a CFFT.
-                   Later, a processing stage reshapes the data to obtain half of the frequency spectrum
-                   in complex format. Except the first complex number that contains the two real numbers
-                   X[0] and X[N/2] all the data is complex. In other words, the first complex sample
-                   contains two real values packed.
-  @par
-                   The input for the inverse RFFT should keep the same format as the output of the
-                   forward RFFT. A first processing stage pre-process the data to later perform an
-                   inverse CFFT.
-  @par
-                   \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
-  @par
-                   The algorithms for floating-point, Q15, and Q31 data are slightly different
-                   and we describe each algorithm in turn.
-  @par           Floating-point
-                   The main functions are \ref arm_rfft_fast_f32() and \ref arm_rfft_fast_init_f32().
-                   The older functions \ref arm_rfft_f32() and \ref arm_rfft_init_f32() have been deprecated
-                   but are still documented.
-  @par
-                   The FFT of a real N-point sequence has even symmetry in the frequency domain. 
-                   The second half of the data equals the conjugate of the first half flipped in frequency. 
-                   Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers.
-                   These are packed into the output array in alternating real and imaginary components:
-  @par
-                   X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ...
-                   real[(N/2)-1], imag[(N/2)-1 }
-  @par
-                   It happens that the first complex number (real[0], imag[0]) is actually
-                   all real. real[0] represents the DC offset, and imag[0] should be 0.
-                   (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is
-                   the first harmonic and so on.
-  @par
-                   The real FFT functions pack the frequency domain data in this fashion.
-                   The forward transform outputs the data in this form and the inverse
-                   transform expects input data in this form. The function always performs
-                   the needed bitreversal so that the input and output data is always in
-                   normal order. The functions support lengths of [32, 64, 128, ..., 4096]
-                   samples.
-  @par           Q15 and Q31
-                   The real algorithms are defined in a similar manner and utilize N/2 complex
-                   transforms behind the scenes.
-  @par
-                   The complex transforms used internally include scaling to prevent fixed-point
-                   overflows.  The overall scaling equals 1/(fftLen/2).
-                   Due to the use of complex transform internally, the source buffer is
-                   modified by the rfft.
-  @par
-                   A separate instance structure must be defined for each transform used but
-                   twiddle factor and bit reversal tables can be reused.
-  @par
-                   There is also an associated initialization function for each data type.
-                   The initialization function performs the following operations:
-                    - Sets the values of the internal structure fields.
-                    - Initializes twiddle factor table and bit reversal table pointers.
-                    - Initializes the internal complex FFT data structure.
-  @par
-                   Use of the initialization function is optional **except for MVE versions where it is mandatory**.
-                   If you don't use the initialization functions, then the structures should be initialized with code
-                   similar to the one below:
-  <pre>
-      arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
-      arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
-  </pre>
-                   where <code>fftLenReal</code> is the length of the real transform;
-                   <code>fftLenBy2</code> length of  the internal complex transform (fftLenReal/2).
-                   <code>ifftFlagR</code> Selects forward (=0) or inverse (=1) transform.
-                   <code>bitReverseFlagR</code> Selects bit reversed output (=0) or normal order
-                   output (=1).
-                   <code>twidCoefRModifier</code> stride modifier for the twiddle factor table.
-                   The value is based on the FFT length;
-                   <code>pTwiddleAReal</code>points to the A array of twiddle coefficients;
-                   <code>pTwiddleBReal</code>points to the B array of twiddle coefficients;
-                   <code>pCfft</code> points to the CFFT Instance structure. The CFFT structure
-                   must also be initialized.  
-@par
-                   Note that with MVE versions you can't initialize instance structures directly and **must
-                   use the initialization function**.
- */
-
-/**
-  @addtogroup RealFFT
-  @{
-*/
-
-/**
-  @brief         Processing function for the floating-point real FFT.
-  @param[in]     S         points to an arm_rfft_fast_instance_f32 structure
-  @param[in]     p         points to input buffer (Source buffer is modified by this function.)
-  @param[in]     pOut      points to output buffer
-  @param[in]     ifftFlag
-                   - value = 0: RFFT
-                   - value = 1: RIFFT
-  @return        none
-*/
-
-void arm_rfft_fast_f32(
-  const arm_rfft_fast_instance_f32 * S,
-  float32_t * p,
-  float32_t * pOut,
-  uint8_t ifftFlag)
-{
-   const arm_cfft_instance_f32 * Sint = &(S->Sint);
-
-   /* Calculation of Real FFT */
-   if (ifftFlag)
-   {
-      /*  Real FFT compression */
-      merge_rfft_f32(S, p, pOut);
-      /* Complex radix-4 IFFT process */
-      arm_cfft_f32( Sint, pOut, ifftFlag, 1);
-   }
-   else
-   {
-      /* Calculation of RFFT of input */
-      arm_cfft_f32( Sint, p, ifftFlag, 1);
-
-      /*  Real FFT extraction */
-      stage_rfft_f32(S, p, pOut);
-   }
-}
-
-/**
-* @} end of RealFFT group
-*/
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_f32.c
+ * Description:  RFFT & RIFFT Floating point process function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+void stage_rfft_f32(
+  const arm_rfft_fast_instance_f32 * S,             /* supplies pTwiddleRFFT and Sint.fftLen */
+        float32_t * p,                              /* interleaved (re, im) input; only read here */
+        float32_t * pOut)                           /* interleaved (re, im) output; two floats per complex bin */
+{
+        uint32_t  k;                                /* Loop counter: complex bins still to process */
+        float32_t twR, twI;                         /* Real / imaginary part of the current RFFT twiddle */
+  const float32_t * pCoeff = S->pTwiddleRFFT;       /* Walks the RFFT twiddle table, two floats per bin */
+        float32_t *pA = p;                          /* Ascending pointer (moves forward through p) */
+        float32_t *pB = p;                          /* Descending pointer (moves backward from the last bin) */
+        float32_t xAR, xAI, xBR, xBI;               /* Real / imaginary parts loaded from pA and pB */
+        float32_t t1a, t1b;                         /* Combined real parts / combined imaginary parts */
+        float32_t p0, p1, p2, p3;                   /* Twiddle products: twR*t1a, twI*t1a, twR*t1b, twI*t1b */
+
+
+   k = (S->Sint).fftLen - 1;                        /* index of the last complex bin in p */
+
+   /* Pack first and last sample of the frequency domain together */
+   /* pA and pB both still point at p here, so xBR/xBI simply duplicate xAR/xAI. */
+   xBR = pB[0];
+   xBI = pB[1];
+   xAR = pA[0];
+   xAI = pA[1];
+   /* The first twiddle pair is read only to advance pCoeff; it is overwritten before any use. */
+   twR = *pCoeff++ ;
+   twI = *pCoeff++ ;
+
+   // U1 = XA(1) + XB(1); % It is real
+   t1a = xBR + xAR  ;
+
+   // U2 = XB(1) - XA(1); % It is imaginary
+   t1b = xBI + xAI  ;
+
+   // With xB == xA this gives out[0] = p[0] + p[1] and out[1] = p[0] - p[1]:
+   // both values of the first slot are real and share the first complex output.
+   *pOut++ = 0.5f * ( t1a + t1b );
+   *pOut++ = 0.5f * ( t1a - t1b );
+   // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
+   // Pairwise pass: pB starts at the last complex bin, pA at the second one.
+   pB  = p + 2*k;
+   pA += 2;
+
+   do
+   {
+      /*
+         function X = my_split_rfft(X, ifftFlag)
+         % X is a series of real numbers
+         L  = length(X);
+         XC = X(1:2:end) +i*X(2:2:end);
+         XA = fft(XC);
+         XB = conj(XA([1 end:-1:2]));
+         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+         for l = 2:L/2
+            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+         end
+         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+         X = XA;
+      */
+
+      xBI = pB[1];
+      xBR = pB[0];
+      xAR = pA[0];
+      xAI = pA[1];
+
+      twR = *pCoeff++;
+      twI = *pCoeff++;
+
+      t1a = xBR - xAR ;
+      t1b = xBI + xAI ;
+      // XB is the conjugate of the bin at pB (see XB above), hence the sum xBI + xAI in t1b.
+      // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+      // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+      p0 = twR * t1a;
+      p1 = twI * t1a;
+      p2 = twR * t1b;
+      p3 = twI * t1b;
+
+      *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
+      *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
+      // NOTE(review): do/while with an unsigned k assumes fftLen >= 2 - confirm callers guarantee it.
+      pA += 2;
+      pB -= 2;
+      k--;
+   } while (k > 0U);
+}
+
+/* Prepares data for inverse cfft: turns the packed RFFT spectrum in p into the complex input of the inverse CFFT */
+void merge_rfft_f32(
+  const arm_rfft_fast_instance_f32 * S,             /* supplies pTwiddleRFFT and Sint.fftLen */
+        float32_t * p,                              /* packed spectrum, interleaved (re, im); only read here */
+        float32_t * pOut)                           /* interleaved (re, im) output; two floats per complex bin */
+{
+        uint32_t  k;                                /* Loop counter: complex bins still to process */
+        float32_t twR, twI;                         /* Real / imaginary part of the current RFFT twiddle */
+  const float32_t *pCoeff = S->pTwiddleRFFT;        /* Walks the RFFT twiddle table, two floats per bin */
+        float32_t *pA = p;                          /* Ascending pointer (moves forward through p) */
+        float32_t *pB = p;                          /* Descending pointer (moves backward from the last bin) */
+        float32_t xAR, xAI, xBR, xBI;               /* Real / imaginary parts loaded from pA and pB */
+        float32_t t1a, t1b, r, s, t, u;             /* t1a/t1b: bin differences/sums; r,s,t,u: twiddle products */
+
+   k = (S->Sint).fftLen - 1;                        /* index of the last complex bin in p */
+
+   xAR = pA[0];
+   xAI = pA[1];
+   /* The first slot is handled without a twiddle factor, so skip the first twiddle pair. */
+   pCoeff += 2 ;
+   /* Unpack the first slot: out[0] = (p[0] + p[1]) / 2, out[1] = (p[0] - p[1]) / 2. */
+   *pOut++ = 0.5f * ( xAR + xAI );
+   *pOut++ = 0.5f * ( xAR - xAI );
+   /* Pairwise pass: pB starts at the last complex bin, pA at the second one. */
+   pB  =  p + 2*k ;
+   pA +=  2	   ;
+
+   while (k > 0U)
+   {
+      /* G is half of the frequency complex spectrum */
+      //for k = 2:N
+      //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+      xBI =   pB[1]    ;
+      xBR =   pB[0]    ;
+      xAR =  pA[0];
+      xAI =  pA[1];
+
+      twR = *pCoeff++;
+      twI = *pCoeff++;
+
+      t1a = xAR - xBR ;
+      t1b = xAI + xBI ;
+
+      r = twR * t1a;
+      s = twI * t1b;
+      t = twI * t1a;
+      u = twR * t1b;
+      // xB is the conjugate of the bin at pB (see the formula above), hence the sum xAI + xBI in t1b.
+      // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+      // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+      *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
+      *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
+
+      pA += 2;
+      pB -= 2;
+      k--;
+   }
+
+}
+
+/**
+  @ingroup groupTransforms
+*/
+
+/**
+  @defgroup RealFFT Real FFT Functions
+ 
+  @par
+                   The CMSIS DSP library includes specialized algorithms for computing the
+                   FFT of real data sequences.  The FFT is defined over complex data but
+                   in many applications the input is real.  Real FFT algorithms take advantage
+                   of the symmetry properties of the FFT and have a speed advantage over complex
+                   algorithms of the same length.
+  @par
+                   The Fast RFFT algorithm relies on the mixed radix CFFT, which saves processor usage.
+  @par
+                   The real length N forward FFT of a sequence is computed using the steps shown below.
+  @par
+                   \image html RFFT.gif "Real Fast Fourier Transform"
+  @par
+                   The real sequence is initially treated as if it were complex to perform a CFFT.
+                   Later, a processing stage reshapes the data to obtain half of the frequency spectrum
+                   in complex format. Except the first complex number that contains the two real numbers
+                   X[0] and X[N/2] all the data is complex. In other words, the first complex sample
+                   contains two real values packed.
+  @par
+                   The input for the inverse RFFT should keep the same format as the output of the
+                   forward RFFT. A first processing stage pre-processes the data to later perform an
+                   inverse CFFT.
+  @par
+                   \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
+  @par
+                   The algorithms for floating-point, Q15, and Q31 data are slightly different
+                   and we describe each algorithm in turn.
+  @par           Floating-point
+                   The main functions are \ref arm_rfft_fast_f32() and \ref arm_rfft_fast_init_f32().
+                   The older functions \ref arm_rfft_f32() and \ref arm_rfft_init_f32() have been deprecated
+                   but are still documented.
+  @par
+                   The FFT of a real N-point sequence has even symmetry in the frequency domain. 
+                   The second half of the data equals the conjugate of the first half flipped in frequency. 
+                   Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers.
+                   These are packed into the output array in alternating real and imaginary components:
+  @par
+                   X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ...
+                   real[(N/2)-1], imag[(N/2)-1] }
+  @par
+                   It happens that the first complex number (real[0], imag[0]) is actually
+                   all real. real[0] represents the DC offset, and imag[0] holds the real X[N/2] term.
+                   (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is
+                   the first harmonic and so on.
+  @par
+                   The real FFT functions pack the frequency domain data in this fashion.
+                   The forward transform outputs the data in this form and the inverse
+                   transform expects input data in this form. The function always performs
+                   the needed bitreversal so that the input and output data is always in
+                   normal order. The functions support lengths of [32, 64, 128, ..., 4096]
+                   samples.
+  @par           Q15 and Q31
+                   The real algorithms are defined in a similar manner and utilize N/2 complex
+                   transforms behind the scenes.
+  @par
+                   The complex transforms used internally include scaling to prevent fixed-point
+                   overflows.  The overall scaling equals 1/(fftLen/2).
+  @par
+                   A separate instance structure must be defined for each transform used but
+                   twiddle factor and bit reversal tables can be reused.
+  @par
+                   There is also an associated initialization function for each data type.
+                   The initialization function performs the following operations:
+                    - Sets the values of the internal structure fields.
+                    - Initializes twiddle factor table and bit reversal table pointers.
+                    - Initializes the internal complex FFT data structure.
+  @par
+                   Use of the initialization function is optional.
+                   However, if the initialization function is used, then the instance structure
+                   cannot be placed into a const data section. To place an instance structure
+                   into a const data section, the instance structure should be manually
+                   initialized as follows:
+  <pre>
+      arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
+      arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
+  </pre>
+                   where <code>fftLenReal</code> is the length of the real transform;
+                   <code>fftLenBy2</code> is the length of the internal complex transform.
+                   <code>ifftFlagR</code> Selects forward (=0) or inverse (=1) transform.
+                   <code>bitReverseFlagR</code> Selects bit reversed output (=0) or normal order
+                   output (=1).
+                   <code>twidCoefRModifier</code> stride modifier for the twiddle factor table.
+                   The value is based on the FFT length;
+                   <code>pTwiddleAReal</code> points to the A array of twiddle coefficients;
+                   <code>pTwiddleBReal</code> points to the B array of twiddle coefficients;
+                   <code>pCfft</code> points to the CFFT Instance structure. The CFFT structure
+                   must also be initialized.  Refer to arm_cfft_radix4_f32() for details regarding
+                   static initialization of the complex FFT instance structure.
+ */
+
+/**
+  @addtogroup RealFFT
+  @{
+*/
+
+/**
+  @brief         Processing function for the floating-point real FFT.
+  @param[in,out] S         points to an arm_rfft_fast_instance_f32 structure (its Sint.fftLen field is rewritten on every call)
+  @param[in,out] p         points to input buffer (in the forward path it is overwritten with the intermediate complex FFT)
+  @param[out]    pOut      points to output buffer
+  @param[in]     ifftFlag
+                   - value = 0: RFFT
+                   - value = 1: RIFFT
+  @return        none
+*/
+
+void arm_rfft_fast_f32(
+  arm_rfft_fast_instance_f32 * S,
+  float32_t * p,
+  float32_t * pOut,
+  uint8_t ifftFlag)
+{
+   arm_cfft_instance_f32 * Sint = &(S->Sint);
+   Sint->fftLen = S->fftLenRFFT / 2;   /* the internal complex FFT runs on half the real length */
+
+   /* Select inverse (ifftFlag != 0) or forward (ifftFlag == 0) processing */
+   if (ifftFlag)
+   {
+      /*  Real FFT compression: build the complex spectrum for the inverse CFFT in pOut */
+      merge_rfft_f32(S, p, pOut);
+
+      /* Complex IFFT process, applied to pOut */
+      arm_cfft_f32( Sint, pOut, ifftFlag, 1);
+   }
+   else
+   {
+      /* Complex FFT of the input, with the real samples treated as interleaved complex data in p */
+      arm_cfft_f32( Sint, p, ifftFlag, 1);
+
+      /*  Real FFT extraction: read the CFFT result from p and write the packed spectrum to pOut */
+      stage_rfft_f32(S, p, pOut);
+   }
+}
+
+/**
+* @} end of RealFFT group
+*/
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c
index e8273b4..ca510cd 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c
@@ -1,352 +1,344 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_rfft_fast_init_f32.c
- * Description:  Split Radix Decimation in Frequency CFFT Floating point processing function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-/**
-  @ingroup groupTransforms
- */
-
-/**
-  @addtogroup RealFFT
-  @{
- */
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
-
-/**
-  @private
-  @brief         Initialization function for the 32pt floating-point real FFT.
-  @param[in,out] S  points to an arm_rfft_fast_instance_f32 structure
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
- */
-
-static arm_status arm_rfft_32_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
-
-  arm_status status;
-
-  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
-
-  status=arm_cfft_init_f32(&(S->Sint),16);
-  if (status != ARM_MATH_SUCCESS)
-  {
-    return(status);
-  }
-
-  S->fftLenRFFT = 32U;
-  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_32;
-
-  return ARM_MATH_SUCCESS;
-}
-#endif 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
-
-/**
-  @private
-  @brief         Initialization function for the 64pt floating-point real FFT.
-  @param[in,out] S  points to an arm_rfft_fast_instance_f32 structure
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
- */
-
-static arm_status arm_rfft_64_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
-
-  arm_status status;
-
-  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
-
-  status=arm_cfft_init_f32(&(S->Sint),32);
-  if (status != ARM_MATH_SUCCESS)
-  {
-    return(status);
-  }
-  S->fftLenRFFT = 64U;
-
-  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_64;
-
-  return ARM_MATH_SUCCESS;
-}
-#endif 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
-
-/**
-  @private
-  @brief         Initialization function for the 128pt floating-point real FFT.
-  @param[in,out] S  points to an arm_rfft_fast_instance_f32 structure
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
- */
-
-static arm_status arm_rfft_128_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
-
-  arm_status status;
-
-  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
-
-  status=arm_cfft_init_f32(&(S->Sint),64);
-  if (status != ARM_MATH_SUCCESS)
-  {
-    return(status);
-  }
-  S->fftLenRFFT = 128;
-
-  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_128;
-
-  return ARM_MATH_SUCCESS;
-}
-#endif 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
-
-/**
-  @private
-  @brief         Initialization function for the 256pt floating-point real FFT.
-  @param[in,out] S  points to an arm_rfft_fast_instance_f32 structure
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
-*/
-
-static arm_status arm_rfft_256_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
-
-  arm_status status;
-
-  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
-
-  status=arm_cfft_init_f32(&(S->Sint),128);
-  if (status != ARM_MATH_SUCCESS)
-  {
-    return(status);
-  }
-  S->fftLenRFFT = 256U;
-
-  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_256;
-
-  return ARM_MATH_SUCCESS;
-}
-#endif 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
-
-/**
-  @private
-  @brief         Initialization function for the 512pt floating-point real FFT.
-  @param[in,out] S  points to an arm_rfft_fast_instance_f32 structure
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
- */
-
-static arm_status arm_rfft_512_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
-
-  arm_status status;
-
-  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
-
-  status=arm_cfft_init_f32(&(S->Sint),256);
-  if (status != ARM_MATH_SUCCESS)
-  {
-    return(status);
-  }
-  S->fftLenRFFT = 512U;
-
-  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_512;
-
-  return ARM_MATH_SUCCESS;
-}
-#endif 
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
-/**
-  @private
-  @brief         Initialization function for the 1024pt floating-point real FFT.
-  @param[in,out] S  points to an arm_rfft_fast_instance_f32 structure
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
- */
-
-static arm_status arm_rfft_1024_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
-
-  arm_status status;
-
-  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
-
-  status=arm_cfft_init_f32(&(S->Sint),512);
-  if (status != ARM_MATH_SUCCESS)
-  {
-    return(status);
-  }
-  S->fftLenRFFT = 1024U;
-
-  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_1024;
-
-  return ARM_MATH_SUCCESS;
-}
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
-/**
-  @private
-  @brief         Initialization function for the 2048pt floating-point real FFT.
-  @param[in,out] S  points to an arm_rfft_fast_instance_f32 structure
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
- */
-static arm_status arm_rfft_2048_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
-
-  arm_status status;
-
-  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
-
-  status=arm_cfft_init_f32(&(S->Sint),1024);
-  if (status != ARM_MATH_SUCCESS)
-  {
-    return(status);
-  }
-  S->fftLenRFFT = 2048U;
-
-  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_2048;
-
-  return ARM_MATH_SUCCESS;
-}
-#endif
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
-/**
-  @private
-* @brief         Initialization function for the 4096pt floating-point real FFT.
-* @param[in,out] S  points to an arm_rfft_fast_instance_f32 structure
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
- */
-
-static arm_status arm_rfft_4096_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
-
-  arm_status status;
-
-  if( !S ) return ARM_MATH_ARGUMENT_ERROR;
-
-  status=arm_cfft_init_f32(&(S->Sint),2048);
-  if (status != ARM_MATH_SUCCESS)
-  {
-    return(status);
-  }
-  S->fftLenRFFT = 4096U;
-
-  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_4096;
-
-  return ARM_MATH_SUCCESS;
-}
-#endif 
-
-/**
-  @brief         Initialization function for the floating-point real FFT.
-  @param[in,out] S       points to an arm_rfft_fast_instance_f32 structure
-  @param[in]     fftLen  length of the Real Sequence
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length
-
-  @par           Description
-                   The parameter <code>fftLen</code> specifies the length of RFFT/CIFFT process.
-                   Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096.
-  @par
-                   This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
- */
-
-arm_status arm_rfft_fast_init_f32(
-  arm_rfft_fast_instance_f32 * S,
-  uint16_t fftLen)
-{
-  typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f32 *);
-  fft_init_ptr fptr = 0x0;
-
-  switch (fftLen)
-  {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
-  case 4096U:
-    fptr = arm_rfft_4096_fast_init_f32;
-    break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
-  case 2048U:
-    fptr = arm_rfft_2048_fast_init_f32;
-    break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
-  case 1024U:
-    fptr = arm_rfft_1024_fast_init_f32;
-    break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
-  case 512U:
-    fptr = arm_rfft_512_fast_init_f32;
-    break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
-  case 256U:
-    fptr = arm_rfft_256_fast_init_f32;
-    break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
-  case 128U:
-    fptr = arm_rfft_128_fast_init_f32;
-    break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
-  case 64U:
-    fptr = arm_rfft_64_fast_init_f32;
-    break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
-  case 32U:
-    fptr = arm_rfft_32_fast_init_f32;
-    break;
-#endif
-  default:
-    break;
-  }
-
-  if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR;
-  return fptr( S );
-
-}
-
-/**
-  @} end of RealFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_fast_init_f32.c
+ * Description:  Initialisation functions for the floating-point fast real FFT
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+  @ingroup groupTransforms
+ */
+
+/**
+  @addtogroup RealFFT
+  @{
+ */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
+
+/**
+  @brief         Prepare an instance for the 32-point floating-point real FFT.
+  @param[in,out] S  points to the arm_rfft_fast_instance_f32 structure to fill in
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : the instance was initialised
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>S</code> is a null pointer
+ */
+
+arm_status arm_rfft_32_fast_init_f32( arm_rfft_fast_instance_f32 * S )
+{
+  arm_cfft_instance_f32 * cfft;
+
+  if ( !S )
+    return ARM_MATH_ARGUMENT_ERROR;
+
+  /* The embedded complex FFT instance is set up for 16 points, the real FFT for 32. */
+  cfft = &(S->Sint);
+  cfft->fftLen       = 16U;
+  cfft->bitRevLength = ARMBITREVINDEXTABLE_16_TABLE_LENGTH;
+  cfft->pBitRevTable = (uint16_t *) armBitRevIndexTable16;
+  cfft->pTwiddle     = (float32_t *) twiddleCoef_16;
+  S->fftLenRFFT      = 32U;
+  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_32;
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
+
+/**
+  @brief         Prepare an instance for the 64-point floating-point real FFT.
+  @param[in,out] S  points to the arm_rfft_fast_instance_f32 structure to fill in
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : the instance was initialised
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>S</code> is a null pointer
+ */
+
+arm_status arm_rfft_64_fast_init_f32( arm_rfft_fast_instance_f32 * S )
+{
+  arm_cfft_instance_f32 * cfft;
+
+  if ( !S )
+    return ARM_MATH_ARGUMENT_ERROR;
+
+  /* The embedded complex FFT instance is set up for 32 points, the real FFT for 64. */
+  cfft = &(S->Sint);
+  cfft->fftLen       = 32U;
+  cfft->bitRevLength = ARMBITREVINDEXTABLE_32_TABLE_LENGTH;
+  cfft->pBitRevTable = (uint16_t *) armBitRevIndexTable32;
+  cfft->pTwiddle     = (float32_t *) twiddleCoef_32;
+  S->fftLenRFFT      = 64U;
+  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_64;
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
+
+/**
+  @brief         Prepare an instance for the 128-point floating-point real FFT.
+  @param[in,out] S  points to the arm_rfft_fast_instance_f32 structure to fill in
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : the instance was initialised
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>S</code> is a null pointer
+ */
+
+arm_status arm_rfft_128_fast_init_f32( arm_rfft_fast_instance_f32 * S )
+{
+  arm_cfft_instance_f32 * cfft;
+
+  if ( !S )
+    return ARM_MATH_ARGUMENT_ERROR;
+
+  /* The embedded complex FFT instance is set up for 64 points, the real FFT for 128. */
+  cfft = &(S->Sint);
+  cfft->fftLen       = 64U;
+  cfft->bitRevLength = ARMBITREVINDEXTABLE_64_TABLE_LENGTH;
+  cfft->pBitRevTable = (uint16_t *) armBitRevIndexTable64;
+  cfft->pTwiddle     = (float32_t *) twiddleCoef_64;
+  S->fftLenRFFT      = 128U;
+  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_128;
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
+
+/**
+  @brief         Prepare an instance for the 256-point floating-point real FFT.
+  @param[in,out] S  points to the arm_rfft_fast_instance_f32 structure to fill in
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : the instance was initialised
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>S</code> is a null pointer
+ */
+
+arm_status arm_rfft_256_fast_init_f32( arm_rfft_fast_instance_f32 * S )
+{
+  arm_cfft_instance_f32 * cfft;
+
+  if ( !S )
+    return ARM_MATH_ARGUMENT_ERROR;
+
+  /* The embedded complex FFT instance is set up for 128 points, the real FFT for 256. */
+  cfft = &(S->Sint);
+  cfft->fftLen       = 128U;
+  cfft->bitRevLength = ARMBITREVINDEXTABLE_128_TABLE_LENGTH;
+  cfft->pBitRevTable = (uint16_t *) armBitRevIndexTable128;
+  cfft->pTwiddle     = (float32_t *) twiddleCoef_128;
+  S->fftLenRFFT      = 256U;
+  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_256;
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
+
+/**
+  @brief         Prepare an instance for the 512-point floating-point real FFT.
+  @param[in,out] S  points to the arm_rfft_fast_instance_f32 structure to fill in
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : the instance was initialised
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>S</code> is a null pointer
+ */
+
+arm_status arm_rfft_512_fast_init_f32( arm_rfft_fast_instance_f32 * S )
+{
+  arm_cfft_instance_f32 * cfft;
+
+  if ( !S )
+    return ARM_MATH_ARGUMENT_ERROR;
+
+  /* The embedded complex FFT instance is set up for 256 points, the real FFT for 512. */
+  cfft = &(S->Sint);
+  cfft->fftLen       = 256U;
+  cfft->bitRevLength = ARMBITREVINDEXTABLE_256_TABLE_LENGTH;
+  cfft->pBitRevTable = (uint16_t *) armBitRevIndexTable256;
+  cfft->pTwiddle     = (float32_t *) twiddleCoef_256;
+  S->fftLenRFFT      = 512U;
+  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_512;
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
+/**
+  @brief         Prepare an instance for the 1024-point floating-point real FFT.
+  @param[in,out] S  points to the arm_rfft_fast_instance_f32 structure to fill in
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : the instance was initialised
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>S</code> is a null pointer
+ */
+
+arm_status arm_rfft_1024_fast_init_f32( arm_rfft_fast_instance_f32 * S )
+{
+  arm_cfft_instance_f32 * cfft;
+
+  if ( !S )
+    return ARM_MATH_ARGUMENT_ERROR;
+
+  /* The embedded complex FFT instance is set up for 512 points, the real FFT for 1024. */
+  cfft = &(S->Sint);
+  cfft->fftLen       = 512U;
+  cfft->bitRevLength = ARMBITREVINDEXTABLE_512_TABLE_LENGTH;
+  cfft->pBitRevTable = (uint16_t *) armBitRevIndexTable512;
+  cfft->pTwiddle     = (float32_t *) twiddleCoef_512;
+  S->fftLenRFFT      = 1024U;
+  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_1024;
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
+/**
+  @brief         Prepare an instance for the 2048-point floating-point real FFT.
+  @param[in,out] S  points to the arm_rfft_fast_instance_f32 structure to fill in
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : the instance was initialised
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>S</code> is a null pointer
+ */
+arm_status arm_rfft_2048_fast_init_f32( arm_rfft_fast_instance_f32 * S )
+{
+  arm_cfft_instance_f32 * cfft;
+
+  if ( !S )
+    return ARM_MATH_ARGUMENT_ERROR;
+
+  /* The embedded complex FFT instance is set up for 1024 points, the real FFT for 2048. */
+  cfft = &(S->Sint);
+  cfft->fftLen       = 1024U;
+  cfft->bitRevLength = ARMBITREVINDEXTABLE_1024_TABLE_LENGTH;
+  cfft->pBitRevTable = (uint16_t *) armBitRevIndexTable1024;
+  cfft->pTwiddle     = (float32_t *) twiddleCoef_1024;
+  S->fftLenRFFT      = 2048U;
+  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_2048;
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
+/**
+  @brief         Prepare an instance for the 4096-point floating-point real FFT.
+  @param[in,out] S  points to the arm_rfft_fast_instance_f32 structure to fill in
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : the instance was initialised
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>S</code> is a null pointer
+ */
+
+arm_status arm_rfft_4096_fast_init_f32( arm_rfft_fast_instance_f32 * S )
+{
+  arm_cfft_instance_f32 * cfft;
+
+  if ( !S )
+    return ARM_MATH_ARGUMENT_ERROR;
+
+  /* The embedded complex FFT instance is set up for 2048 points, the real FFT for 4096. */
+  cfft = &(S->Sint);
+  cfft->fftLen       = 2048U;
+  cfft->bitRevLength = ARMBITREVINDEXTABLE_2048_TABLE_LENGTH;
+  cfft->pBitRevTable = (uint16_t *) armBitRevIndexTable2048;
+  cfft->pTwiddle     = (float32_t *) twiddleCoef_2048;
+  S->fftLenRFFT      = 4096U;
+  S->pTwiddleRFFT    = (float32_t *) twiddleCoef_rfft_4096;
+  return ARM_MATH_SUCCESS;
+}
+#endif
+
+/**
+  @brief         Initialization function for the floating-point real FFT.
+  @param[in,out] S       points to an arm_rfft_fast_instance_f32 structure
+  @param[in]     fftLen  length of the real sequence
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLen</code> is not a supported length, its tables are compiled out, or <code>S</code> is NULL
+
+  @par           Description
+                   The parameter <code>fftLen</code> specifies the length of the RFFT/RIFFT process.
+                   Supported FFT lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096.
+  @par
+                   The call is forwarded to the length-specific init function, which sets the twiddle factor and bit reversal table pointers.
+ */
+
+arm_status arm_rfft_fast_init_f32(
+  arm_rfft_fast_instance_f32 * S,
+  uint16_t fftLen)
+{
+  typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f32 *);
+  fft_init_ptr fptr = 0x0;
+  /* Each case carries the same guard as the init function it selects, so a length is offered only when that function was compiled in. */
+  switch (fftLen)
+  {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
+  case 4096U:
+    fptr = arm_rfft_4096_fast_init_f32;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
+  case 2048U:
+    fptr = arm_rfft_2048_fast_init_f32;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
+  case 1024U:
+    fptr = arm_rfft_1024_fast_init_f32;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
+  case 512U:
+    fptr = arm_rfft_512_fast_init_f32;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
+  case 256U:
+    fptr = arm_rfft_256_fast_init_f32;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
+  case 128U:
+    fptr = arm_rfft_128_fast_init_f32;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
+  case 64U:
+    fptr = arm_rfft_64_fast_init_f32;
+    break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
+  case 32U:
+    fptr = arm_rfft_32_fast_init_f32;
+    break;
+#endif
+  default:
+    return ARM_MATH_ARGUMENT_ERROR;
+  }
+
+  if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR;
+  return fptr( S );
+
+}
+
+/**
+  @} end of RealFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c
index 0a32da6..3d57a21 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c
@@ -1,147 +1,139 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_rfft_init_f32.c
- * Description:  RFFT & RIFFT Floating point initialisation function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-
-
-/**
-  @addtogroup RealFFT
-  @{
- */
-
-/**
-  @brief         Initialization function for the floating-point RFFT/RIFFT.
-  @deprecated    Do not use this function. It has been superceded by \ref arm_rfft_fast_init_f32 and will be removed in the future.
-  @param[in,out] S             points to an instance of the floating-point RFFT/RIFFT structure
-  @param[in,out] S_CFFT        points to an instance of the floating-point CFFT/CIFFT structure
-  @param[in]     fftLenReal     length of the FFT.
-  @param[in]     ifftFlagR      flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length
-
-  @par Description
-                   The parameter <code>fftLenReal</code>specifies length of RFFT/RIFFT Process.
-                   Supported FFT Lengths are 128, 512, 2048.
-  @par
-                   The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   This function also initializes Twiddle factor table.
- */
-
-arm_status arm_rfft_init_f32(
-  arm_rfft_instance_f32 * S,
-  arm_cfft_radix4_instance_f32 * S_CFFT,
-  uint32_t fftLenReal,
-  uint32_t ifftFlagR,
-  uint32_t bitReverseFlag)
-{
-   /*  Initialise the default arm status */
-  arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_F32)
-
-  /*  Initialise the default arm status */
-  status = ARM_MATH_SUCCESS;
-
-  /*  Initialize the Real FFT length */
-  S->fftLenReal = (uint16_t) fftLenReal;
-
-  /*  Initialize the Complex FFT length */
-  S->fftLenBy2 = (uint16_t) fftLenReal / 2U;
-
-  /*  Initialize the Twiddle coefficientA pointer */
-  S->pTwiddleAReal = (float32_t *) realCoefA;
-
-  /*  Initialize the Twiddle coefficientB pointer */
-  S->pTwiddleBReal = (float32_t *) realCoefB;
-
-  /*  Initialize the Flag for selection of RFFT or RIFFT */
-  S->ifftFlagR = (uint8_t) ifftFlagR;
-
-  /*  Initialize the Flag for calculation Bit reversal or not */
-  S->bitReverseFlagR = (uint8_t) bitReverseFlag;
-
-  /*  Initializations of structure parameters depending on the FFT length */
-  switch (S->fftLenReal)
-  {
-    /* Init table modifier value */
-  case 8192U:
-    S->twidCoefRModifier = 1U;
-    break;
-  case 2048U:
-    S->twidCoefRModifier = 4U;
-    break;
-  case 512U:
-    S->twidCoefRModifier = 16U;
-    break;
-  case 128U:
-    S->twidCoefRModifier = 64U;
-    break;
-  default:
-    /*  Reporting argument error if rfftSize is not valid value */
-    status = ARM_MATH_ARGUMENT_ERROR;
-    break;
-  }
-
-  /* Init Complex FFT Instance */
-  S->pCfft = S_CFFT;
-
-  if (S->ifftFlagR)
-  {
-    /* Initializes the CIFFT Module for fftLenreal/2 length */
-    arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 1U, 0U);
-  }
-  else
-  {
-    /* Initializes the CFFT Module for fftLenreal/2 length */
-    arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 0U, 0U);
-  }
-
-#endif
-#endif
-  /* return the status of RFFT Init function */
-  return (status);
-
-}
-
-/**
-  @} end of RealFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_init_f32.c
+ * Description:  RFFT & RIFFT Floating point initialisation function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+
+/**
+  @addtogroup RealFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the floating-point RFFT/RIFFT.
+  @deprecated    Do not use this function. It has been superseded by \ref arm_rfft_fast_init_f32 and will be removed in the future.
+  @param[in,out] S             points to an instance of the floating-point RFFT/RIFFT structure
+  @param[in,out] S_CFFT        points to an instance of the floating-point CFFT/CIFFT structure
+  @param[in]     fftLenReal     length of the FFT.
+  @param[in]     ifftFlagR      flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length
+
+  @par Description
+                   The parameter <code>fftLenReal</code> specifies length of RFFT/RIFFT Process.
+                   Supported FFT Lengths are 128, 512, 2048, 8192.
+  @par
+                   The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   This function also initializes Twiddle factor table.
+ */
+
+arm_status arm_rfft_init_f32(
+  arm_rfft_instance_f32 * S,
+  arm_cfft_radix4_instance_f32 * S_CFFT,
+  uint32_t fftLenReal,
+  uint32_t ifftFlagR,
+  uint32_t bitReverseFlag)
+{
+
+  /*  Initialise the default arm status */
+  arm_status status = ARM_MATH_SUCCESS;
+
+  /*  Initialize the Real FFT length */
+  S->fftLenReal = (uint16_t) fftLenReal;
+
+  /*  Initialize the Complex FFT length */
+  S->fftLenBy2 = (uint16_t) fftLenReal / 2U;
+
+  /*  Initialize the Twiddle coefficientA pointer */
+  S->pTwiddleAReal = (float32_t *) realCoefA;
+
+  /*  Initialize the Twiddle coefficientB pointer */
+  S->pTwiddleBReal = (float32_t *) realCoefB;
+
+  /*  Initialize the Flag for selection of RFFT or RIFFT */
+  S->ifftFlagR = (uint8_t) ifftFlagR;
+
+  /*  Initialize the Flag for calculation Bit reversal or not */
+  S->bitReverseFlagR = (uint8_t) bitReverseFlag;
+
+  /*  Validate the caller's full 32-bit length: the uint16_t copy stored above would let e.g. 65664 alias to 128 */
+  switch (fftLenReal)
+  {
+    /* Init table modifier value */
+  case 8192U:
+    S->twidCoefRModifier = 1U;
+    break;
+  case 2048U:
+    S->twidCoefRModifier = 4U;
+    break;
+  case 512U:
+    S->twidCoefRModifier = 16U;
+    break;
+  case 128U:
+    S->twidCoefRModifier = 64U;
+    break;
+  default:
+    /*  Reporting argument error if rfftSize is not valid value */
+    status = ARM_MATH_ARGUMENT_ERROR;
+    break;
+  }
+
+  /* Init Complex FFT Instance (still performed when the length was rejected above) */
+  S->pCfft = S_CFFT;
+
+  if (S->ifftFlagR)
+  {
+    /* Initializes the CIFFT Module for fftLenreal/2 length */
+    arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 1U, 0U);
+  }
+  else
+  {
+    /* Initializes the CFFT Module for fftLenreal/2 length */
+    arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 0U, 0U);
+  }
+
+  /* return the status of RFFT Init function */
+  return (status);
+
+}
+
+/**
+  @} end of RealFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c
index e70f8af..fb4c66c 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c
@@ -1,248 +1,158 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_rfft_init_q15.c
- * Description:  RFFT & RIFFT Q15 initialisation function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-#include "arm_const_structs.h"
-
-/**
-  @addtogroup RealFFT
-  @{
- */
-
-/**
-  @brief         Initialization function for the Q15 RFFT/RIFFT.
-  @param[in,out] S              points to an instance of the Q15 RFFT/RIFFT structure
-  @param[in]     fftLenReal     length of the FFT
-  @param[in]     ifftFlagR      flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length
-
-  @par           Details
-                   The parameter <code>fftLenReal</code> specifies length of RFFT/RIFFT Process.
-                   Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192.
-  @par
-                   The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   This function also initializes Twiddle factor table.
- */
-
-arm_status arm_rfft_init_q15(
-    arm_rfft_instance_q15 * S,
-    uint32_t fftLenReal,
-    uint32_t ifftFlagR,
-    uint32_t bitReverseFlag)
-{
-     /*  Initialise the default arm status */
-    arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q15)
-
-    /*  Initialise the default arm status */
-    status = ARM_MATH_SUCCESS;
-
-    /*  Initialize the Real FFT length */
-    S->fftLenReal = (uint16_t) fftLenReal;
-
-    /*  Initialize the Twiddle coefficientA pointer */
-    S->pTwiddleAReal = (q15_t *) realCoefAQ15;
-
-    /*  Initialize the Twiddle coefficientB pointer */
-    S->pTwiddleBReal = (q15_t *) realCoefBQ15;
-
-    /*  Initialize the Flag for selection of RFFT or RIFFT */
-    S->ifftFlagR = (uint8_t) ifftFlagR;
-
-    /*  Initialize the Flag for calculation Bit reversal or not */
-    S->bitReverseFlagR = (uint8_t) bitReverseFlag;
-
-    /*  Initialization of coef modifier depending on the FFT length */
-    switch (S->fftLenReal)
-    {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
-    case 8192U:
-        S->twidCoefRModifier = 1U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),4096);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-          S->pCfft = &arm_cfft_sR_q15_len4096;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
-    case 4096U:
-        S->twidCoefRModifier = 2U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),2048);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q15_len2048;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
-    case 2048U:
-        S->twidCoefRModifier = 4U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),1024);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q15_len1024;
-        #endif
-        break;
-#endif 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
-    case 1024U:
-        S->twidCoefRModifier = 8U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),512);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-          S->pCfft = &arm_cfft_sR_q15_len512;
-        #endif
-        break;
-#endif 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
-    case 512U:
-        S->twidCoefRModifier = 16U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),256);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q15_len256;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
-    case 256U:
-        S->twidCoefRModifier = 32U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),128);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q15_len128;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
-    case 128U:
-        S->twidCoefRModifier = 64U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),64);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q15_len64;
-        #endif
-        break;
-#endif 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
-    case 64U:
-        S->twidCoefRModifier = 128U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),32);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-          S->pCfft = &arm_cfft_sR_q15_len32;
-        #endif
-        break;
-#endif 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
-    case 32U:
-        S->twidCoefRModifier = 256U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q15(&(S->cfftInst),16);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q15_len16;
-        #endif
-        break;
-#endif
-    default:
-        /*  Reporting argument error if rfftSize is not valid value */
-        status = ARM_MATH_ARGUMENT_ERROR;
-        break;
-    }
-
-#endif
-#endif
-    /* return the status of RFFT Init function */
-    return (status);
-}
-
-/**
-  @} end of RealFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_init_q15.c
+ * Description:  RFFT & RIFFT Q15 initialisation function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+#include "arm_const_structs.h"
+
+/**
+  @addtogroup RealFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the Q15 RFFT/RIFFT.
+  @param[in,out] S              points to an instance of the Q15 RFFT/RIFFT structure
+  @param[in]     fftLenReal     length of the FFT
+  @param[in]     ifftFlagR      flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>fftLenReal</code> specifies length of RFFT/RIFFT Process.
+                   Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192.
+  @par
+                   The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   This function also initializes Twiddle factor table.
+ */
+
+arm_status arm_rfft_init_q15(
+    arm_rfft_instance_q15 * S,
+    uint32_t fftLenReal,
+    uint32_t ifftFlagR,
+    uint32_t bitReverseFlag)
+{
+    /*  Assume success; the switch below downgrades this for unsupported lengths */
+    arm_status status = ARM_MATH_SUCCESS;
+
+    /*  Store the Real FFT length (the cast keeps only the low 16 bits) */
+    S->fftLenReal = (uint16_t) fftLenReal;
+
+    /*  Initialize the Twiddle coefficientA pointer */
+    S->pTwiddleAReal = (q15_t *) realCoefAQ15;
+
+    /*  Initialize the Twiddle coefficientB pointer */
+    S->pTwiddleBReal = (q15_t *) realCoefBQ15;
+
+    /*  Initialize the Flag for selection of RFFT or RIFFT */
+    S->ifftFlagR = (uint8_t) ifftFlagR;
+
+    /*  Initialize the Flag for calculation Bit reversal or not */
+    S->bitReverseFlagR = (uint8_t) bitReverseFlag;
+
+    /*  Pick coef modifier and CFFT instance from the untruncated length so values above 65535 are rejected */
+    switch (fftLenReal)
+    {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+    case 8192U:
+        S->twidCoefRModifier = 1U;
+        S->pCfft = &arm_cfft_sR_q15_len4096;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+    case 4096U:
+        S->twidCoefRModifier = 2U;
+        S->pCfft = &arm_cfft_sR_q15_len2048;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+    case 2048U:
+        S->twidCoefRModifier = 4U;
+        S->pCfft = &arm_cfft_sR_q15_len1024;
+        break;
+#endif 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+    case 1024U:
+        S->twidCoefRModifier = 8U;
+        S->pCfft = &arm_cfft_sR_q15_len512;
+        break;
+#endif 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+    case 512U:
+        S->twidCoefRModifier = 16U;
+        S->pCfft = &arm_cfft_sR_q15_len256;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+    case 256U:
+        S->twidCoefRModifier = 32U;
+        S->pCfft = &arm_cfft_sR_q15_len128;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+    case 128U:
+        S->twidCoefRModifier = 64U;
+        S->pCfft = &arm_cfft_sR_q15_len64;
+        break;
+#endif 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+    case 64U:
+        S->twidCoefRModifier = 128U;
+        S->pCfft = &arm_cfft_sR_q15_len32;
+        break;
+#endif 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+    case 32U:
+        S->twidCoefRModifier = 256U;
+        S->pCfft = &arm_cfft_sR_q15_len16;
+        break;
+#endif
+    default:
+        /*  Report an argument error: fftLenReal is not a supported (or compiled-in) length; twidCoefRModifier and pCfft are left unset */
+        status = ARM_MATH_ARGUMENT_ERROR;
+        break;
+    }
+
+    /* return the status of RFFT Init function */
+    return (status);
+}
+
+/**
+  @} end of RealFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c
index 0a28719..efae1ea 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c
@@ -1,246 +1,160 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_rfft_init_q31.c
- * Description:  RFFT & RIFFT Q31 initialisation function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-#include "arm_common_tables.h"
-#include "arm_const_structs.h"
-
-
-
-/**
-  @addtogroup RealFFT
-  @{
- */
-
-/**
-  @brief         Initialization function for the Q31 RFFT/RIFFT.
-  @param[in,out] S              points to an instance of the Q31 RFFT/RIFFT structure
-  @param[in]     fftLenReal     length of the FFT
-  @param[in]     ifftFlagR      flag that selects transform direction
-                   - value = 0: forward transform
-                   - value = 1: inverse transform
-  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
-                   - value = 0: disables bit reversal of output
-                   - value = 1: enables bit reversal of output
-  @return        execution status
-                   - \ref ARM_MATH_SUCCESS        : Operation successful
-                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length
-
-  @par           Details
-                   The parameter <code>fftLenReal</code> specifies length of RFFT/RIFFT Process.
-                   Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192.
-  @par
-                   The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed.
-                   Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
-  @par
-                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
-                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
-  @par
-                   This function also initializes Twiddle factor table.
-*/
-
-arm_status arm_rfft_init_q31(
-    arm_rfft_instance_q31 * S,
-    uint32_t fftLenReal,
-    uint32_t ifftFlagR,
-    uint32_t bitReverseFlag)
-{
-     /*  Initialise the default arm status */
-    arm_status status = ARM_MATH_ARGUMENT_ERROR;
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
-
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q31)
-
-    /*  Initialise the default arm status */
-    status = ARM_MATH_SUCCESS;
-
-    /*  Initialize the Real FFT length */
-    S->fftLenReal = (uint16_t) fftLenReal;
-
-    /*  Initialize the Twiddle coefficientA pointer */
-    S->pTwiddleAReal = (q31_t *) realCoefAQ31;
-
-    /*  Initialize the Twiddle coefficientB pointer */
-    S->pTwiddleBReal = (q31_t *) realCoefBQ31;
-
-    /*  Initialize the Flag for selection of RFFT or RIFFT */
-    S->ifftFlagR = (uint8_t) ifftFlagR;
-
-    /*  Initialize the Flag for calculation Bit reversal or not */
-    S->bitReverseFlagR = (uint8_t) bitReverseFlag;
-
-    /*  Initialization of coef modifier depending on the FFT length */
-    switch (S->fftLenReal)
-    {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
-    case 8192U:
-
-
-        S->twidCoefRModifier = 1U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),4096);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q31_len4096;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
-    case 4096U:
-        S->twidCoefRModifier = 2U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),2048);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q31_len2048;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
-    case 2048U:
-        S->twidCoefRModifier = 4U;
-
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),1024);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-          S->pCfft = &arm_cfft_sR_q31_len1024;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
-    case 1024U:
-        S->twidCoefRModifier = 8U;
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),512);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q31_len512;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
-    case 512U:
-        S->twidCoefRModifier = 16U;
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),256);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q31_len256;
-        #endif
-        break;
-#endif 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
-    case 256U:
-        S->twidCoefRModifier = 32U;
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),128);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q31_len128;
-        #endif
-        break;
-#endif 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
-    case 128U:
-        S->twidCoefRModifier = 64U;
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),64);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q31_len64;
-        #endif
-        break;
-#endif
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
-    case 64U:
-        S->twidCoefRModifier = 128U;
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),32);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q31_len32;
-        #endif
-        break;
-#endif 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
-    case 32U:
-        S->twidCoefRModifier = 256U;
-        #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           status=arm_cfft_init_q31(&(S->cfftInst),16);
-           if (status != ARM_MATH_SUCCESS)
-           {
-               return(status);
-           }
-        #else
-           S->pCfft = &arm_cfft_sR_q31_len16;
-        #endif
-        break;
-#endif
-    default:
-        /*  Reporting argument error if rfftSize is not valid value */
-        status = ARM_MATH_ARGUMENT_ERROR;
-        break;
-    }
-
-#endif
-#endif
-    /* return the status of RFFT Init function */
-    return (status);
-}
-
-/**
-  @} end of RealFFT group
- */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_init_q31.c
+ * Description:  RFFT & RIFFT Q31 initialisation function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+#include "arm_const_structs.h"
+
+
+
+/**
+  @addtogroup RealFFT
+  @{
+ */
+
+/**
+  @brief         Initialization function for the Q31 RFFT/RIFFT.
+  @param[in,out] S              points to an instance of the Q31 RFFT/RIFFT structure
+  @param[in]     fftLenReal     length of the FFT
+  @param[in]     ifftFlagR      flag that selects transform direction
+                   - value = 0: forward transform
+                   - value = 1: inverse transform
+  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
+                   - value = 0: disables bit reversal of output
+                   - value = 1: enables bit reversal of output
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS        : Operation successful
+                   - \ref ARM_MATH_ARGUMENT_ERROR : <code>fftLenReal</code> is not a supported length
+
+  @par           Details
+                   The parameter <code>fftLenReal</code> specifies length of RFFT/RIFFT Process.
+                   Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192.
+  @par
+                   The parameter <code>ifftFlagR</code> controls whether a forward or inverse transform is computed.
+                   Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
+  @par
+                   The parameter <code>bitReverseFlag</code> controls whether output is in normal order or bit reversed order.
+                   Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+  @par
+                   This function also initializes Twiddle factor table.
+*/
+
+arm_status arm_rfft_init_q31(
+    arm_rfft_instance_q31 * S,
+    uint32_t fftLenReal,
+    uint32_t ifftFlagR,
+    uint32_t bitReverseFlag)
+{
+    /*  Assume success; the switch below downgrades this for unsupported lengths */
+    arm_status status = ARM_MATH_SUCCESS;
+
+    /*  Store the Real FFT length (the cast keeps only the low 16 bits) */
+    S->fftLenReal = (uint16_t) fftLenReal;
+
+    /*  Initialize the Twiddle coefficientA pointer */
+    S->pTwiddleAReal = (q31_t *) realCoefAQ31;
+
+    /*  Initialize the Twiddle coefficientB pointer */
+    S->pTwiddleBReal = (q31_t *) realCoefBQ31;
+
+    /*  Initialize the Flag for selection of RFFT or RIFFT */
+    S->ifftFlagR = (uint8_t) ifftFlagR;
+
+    /*  Initialize the Flag for calculation Bit reversal or not */
+    S->bitReverseFlagR = (uint8_t) bitReverseFlag;
+
+    /*  Pick coef modifier and CFFT instance from the untruncated length so values above 65535 are rejected */
+    switch (fftLenReal)
+    {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+    case 8192U:
+        S->twidCoefRModifier = 1U;
+        S->pCfft = &arm_cfft_sR_q31_len4096;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+    case 4096U:
+        S->twidCoefRModifier = 2U;
+        S->pCfft = &arm_cfft_sR_q31_len2048;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+    case 2048U:
+        S->twidCoefRModifier = 4U;
+        S->pCfft = &arm_cfft_sR_q31_len1024;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+    case 1024U:
+        S->twidCoefRModifier = 8U;
+        S->pCfft = &arm_cfft_sR_q31_len512;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+    case 512U:
+        S->twidCoefRModifier = 16U;
+        S->pCfft = &arm_cfft_sR_q31_len256;
+        break;
+#endif 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+    case 256U:
+        S->twidCoefRModifier = 32U;
+        S->pCfft = &arm_cfft_sR_q31_len128;
+        break;
+#endif 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+    case 128U:
+        S->twidCoefRModifier = 64U;
+        S->pCfft = &arm_cfft_sR_q31_len64;
+        break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+    case 64U:
+        S->twidCoefRModifier = 128U;
+        S->pCfft = &arm_cfft_sR_q31_len32;
+        break;
+#endif 
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+    case 32U:
+        S->twidCoefRModifier = 256U;
+        S->pCfft = &arm_cfft_sR_q31_len16;
+        break;
+#endif
+    default:
+        /*  Report an argument error: fftLenReal is not a supported (or compiled-in) length; twidCoefRModifier and pCfft are left unset */
+        status = ARM_MATH_ARGUMENT_ERROR;
+        break;
+    }
+
+    /* return the status of RFFT Init function */
+    return (status);
+}
+
+/**
+  @} end of RealFFT group
+ */
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c
index 7d149c6..29d12b7 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c
@@ -1,526 +1,380 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_rfft_q15.c
- * Description:  RFFT & RIFFT Q15 process function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-/* ----------------------------------------------------------------------
- * Internal functions prototypes
- * -------------------------------------------------------------------- */
-
-void arm_split_rfft_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pATable,
-  const q15_t * pBTable,
-        q15_t * pDst,
-        uint32_t modifier);
-
-void arm_split_rifft_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pATable,
-  const q15_t * pBTable,
-        q15_t * pDst,
-        uint32_t modifier);
-
-/**
-  @addtogroup RealFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the Q15 RFFT/RIFFT.
-  @param[in]     S     points to an instance of the Q15 RFFT/RIFFT structure
-  @param[in]     pSrc  points to input buffer (Source buffer is modified by this function.)
-  @param[out]    pDst  points to output buffer
-  @return        none
-
-  @par           Input an output formats
-                   Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
-                   Hence the output format is different for different RFFT sizes.
-                   The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
-  @par
-                   \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
-  @par
-                   \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
-  @par
-                   If the input buffer is of length N, the output buffer must have length 2*N.
-                   The input buffer is modified by this function.
-  @par
-                   For the RIFFT, the source buffer must at least have length 
-                   fftLenReal + 2.
-                   The last two elements must be equal to what would be generated
-                   by the RFFT:
-                     (pSrc[0] - pSrc[1]) >> 1 and 0
- */
-
-void arm_rfft_q15(
-  const arm_rfft_instance_q15 * S,
-        q15_t * pSrc,
-        q15_t * pDst)
-{
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-  const arm_cfft_instance_q15 *S_CFFT = &(S->cfftInst);
-#else
-  const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
-#endif
-        uint32_t L2 = S->fftLenReal >> 1U;
-
-  /* Calculation of RIFFT of input */
-  if (S->ifftFlagR == 1U)
-  {
-     /*  Real IFFT core process */
-     arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
-
-     /* Complex IFFT process */
-     arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
-
-     arm_shift_q15(pDst, 1, pDst, S->fftLenReal);
-  }
-  else
-  {
-     /* Calculation of RFFT of input */
-
-     /* Complex FFT process */
-     arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
-
-     /*  Real FFT core process */
-     arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
-  }
-
-}
-
-/**
-  @} end of RealFFT group
- */
-
-/**
-  @brief         Core Real FFT process
-  @param[in]     pSrc      points to input buffer
-  @param[in]     fftLen    length of FFT
-  @param[in]     pATable   points to twiddle Coef A buffer
-  @param[in]     pBTable   points to twiddle Coef B buffer
-  @param[out]    pDst      points to output buffer
-  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  @return        none
-
-  @par
-                   The function implements a Real FFT
- */
-
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "arm_helium_utils.h"
-#include "arm_vec_fft.h"
-
-#if defined(__CMSIS_GCC_H)
-#define MVE_CMPLX_MULT_FX_AxB_S16(A,B)          vqdmladhxq_s16(vqdmlsdhq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
-#define MVE_CMPLX_MULT_FX_AxConjB_S16(A,B)      vqdmladhq_s16(vqdmlsdhxq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
-
-#endif 
-
-void arm_split_rfft_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pATable,
-  const q15_t * pBTable,
-        q15_t * pDst,
-        uint32_t modifier)
-{
-   uint32_t        i;          /* Loop Counter */
-    const q15_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
-    q15_t          *pOut1 = &pDst[2];
-    q15_t          *pIn1 = &pSrc[2];
-    uint16x8_t      offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 };
-    uint16x8_t      offsetCoef;
-    const uint16_t  offsetCoefArr[16] = {
-        0, 0, 2, 2, 4, 4, 6, 6,
-        0, 1, 0, 1, 0, 1, 0, 1
-    };
-
-    offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
-    offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8));
-
-    /* Init coefficient pointers */
-    pCoefA = &pATable[modifier * 2];
-    pCoefB = &pBTable[modifier * 2];
-
-    const q15_t    *pCoefAb, *pCoefBb;
-    pCoefAb = pCoefA;
-    pCoefBb = pCoefB;
-
-    pIn1 = &pSrc[2];
-
-    i = fftLen - 1U;
-    i = i / 4 + 1;
-    while (i > 0U) {
-        q15x8_t         in1 = vld1q_s16(pIn1);
-        q15x8_t         in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn);
-        q15x8_t         coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
-        q15x8_t         coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
-
-#if defined(__CMSIS_GCC_H)
-        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB_S16(in1, coefA),
-                                     MVE_CMPLX_MULT_FX_AxConjB_S16(coefB, in2));
-#else
-        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB(in1, coefA, q15x8_t),
-                                         MVE_CMPLX_MULT_FX_AxConjB(coefB, in2, q15x8_t));
-#endif
-        vst1q_s16(pOut1, out);
-        pOut1 += 8;
-
-        offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
-        offsetIn -= 8;
-        pIn1 += 8;
-        i -= 1;
-    }
-
-    pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
-    pDst[2 * fftLen + 1] = 0;
-
-    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
-    pDst[1] = 0;
-}
-#else
-void arm_split_rfft_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pATable,
-  const q15_t * pBTable,
-        q15_t * pDst,
-        uint32_t modifier)
-{       
-        uint32_t i;                                    /* Loop Counter */
-        q31_t outR, outI;                              /* Temporary variables for output */
-  const q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
-        q15_t *pSrc1, *pSrc2;
-#if defined (ARM_MATH_DSP)
-        q15_t *pD1, *pD2;
-#endif
-
-  /* Init coefficient pointers */
-  pCoefA = &pATable[modifier * 2];
-  pCoefB = &pBTable[modifier * 2];
-
-  pSrc1 = &pSrc[2];
-  pSrc2 = &pSrc[(2U * fftLen) - 2U];
-
-#if defined (ARM_MATH_DSP)
-
-    i = 1U;
-    pD1 = pDst + 2;
-    pD2 = pDst + (4U * fftLen) - 2;
-
-    for (i = fftLen - 1; i > 0; i--)
-    {
-        /*
-          outR = (  pSrc[2 * i]             * pATable[2 * i]
-                  - pSrc[2 * i + 1]         * pATable[2 * i + 1]
-                  + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
-                  + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-
-          outI = (  pIn[2 * i + 1]         * pATable[2 * i]
-                  + pIn[2 * i]             * pATable[2 * i + 1]
-                  + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
-                  - pIn[2 * n - 2 * i + 1] * pBTable[2 * i])
-         */
-
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        /* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */
-        outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA));
-#else
-        /* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */
-        outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)));
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
-        outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U;
-
-        /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
-#ifndef ARM_MATH_BIG_ENDIAN
-        outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
-#else
-        outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2));
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] */
-        outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI);
-
-        /* write output */
-        *pD1++ = (q15_t) outR;
-        *pD1++ = outI >> 16U;
-
-        /* write complex conjugate output */
-        pD2[0] = (q15_t) outR;
-        pD2[1] = -(outI >> 16U);
-        pD2 -= 2;
-
-        /* update coefficient pointer */
-        pCoefB = pCoefB + (2U * modifier);
-        pCoefA = pCoefA + (2U * modifier);
-    }
-
-    pDst[2U * fftLen]      = (pSrc[0] - pSrc[1]) >> 1U;
-    pDst[2U * fftLen + 1U] = 0;
-
-    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
-    pDst[1] = 0;
-
-#else
-
-    i = 1U;
-
-    while (i < fftLen)
-    {
-        /*
-          outR = (  pSrc[2 * i]             * pATable[2 * i]
-                  - pSrc[2 * i + 1]         * pATable[2 * i + 1]
-                  + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
-                  + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-        */
-
-        outR = *pSrc1 * *pCoefA;
-        outR = outR - (*(pSrc1 + 1) * *(pCoefA + 1));
-        outR = outR + (*pSrc2 * *pCoefB);
-        outR = (outR + (*(pSrc2 + 1) * *(pCoefB + 1))) >> 16;
-
-        /*
-          outI = (  pIn[2 * i + 1]         * pATable[2 * i]
-                  + pIn[2 * i]             * pATable[2 * i + 1]
-                  + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
-                  - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
-        */
-
-        outI = *pSrc2 * *(pCoefB + 1);
-        outI = outI - (*(pSrc2 + 1) * *pCoefB);
-        outI = outI + (*(pSrc1 + 1) * *pCoefA);
-        outI = outI + (*pSrc1 * *(pCoefA + 1));
-
-        /* update input pointers */
-        pSrc1 += 2U;
-        pSrc2 -= 2U;
-
-        /* write output */
-        pDst[2U * i] = (q15_t) outR;
-        pDst[2U * i + 1U] = outI >> 16U;
-
-        /* write complex conjugate output */
-        pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR;
-        pDst[((4U * fftLen) - (2U * i)) + 1U] = -(outI >> 16U);
-
-        /* update coefficient pointer */
-        pCoefB = pCoefB + (2U * modifier);
-        pCoefA = pCoefA + (2U * modifier);
-
-        i++;
-    }
-
-    pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
-    pDst[2U * fftLen + 1U] = 0;
-
-    pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
-    pDst[1] = 0;
-
-#endif /* #if defined (ARM_MATH_DSP) */
-}
-#endif /* defined(ARM_MATH_MVEI) */
-
-/**
-  @brief         Core Real IFFT process
-  @param[in]     pSrc      points to input buffer
-  @param[in]     fftLen    length of FFT
-  @param[in]     pATable   points to twiddle Coef A buffer
-  @param[in]     pBTable   points to twiddle Coef B buffer
-  @param[out]    pDst      points to output buffer
-  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  @return        none
-
-  @par
-                   The function implements a Real IFFT
- */
-
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "arm_helium_utils.h"
-#include "arm_vec_fft.h"
-
-void arm_split_rifft_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pATable,
-  const q15_t * pBTable,
-        q15_t * pDst,
-        uint32_t modifier)
-{
-   uint32_t        i;                  /* Loop Counter */
-    const q15_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
-    q15_t          *pIn1;
-    uint16x8_t      offset = { 6, 7, 4, 5, 2, 3, 0, 1 };
-    uint16x8_t      offsetCoef;
-    int16x8_t       conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */
-    const uint16_t  offsetCoefArr[16] = {
-        0, 0, 2, 2, 4, 4, 6, 6,
-        0, 1, 0, 1, 0, 1, 0, 1
-    };
-
-    offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
-
-    offset = vaddq_n_u16(offset, (2 * fftLen - 6));
-
-    /* Init coefficient pointers */
-    pCoefA = &pATable[0];
-    pCoefB = &pBTable[0];
-
-    const q15_t    *pCoefAb, *pCoefBb;
-    pCoefAb = pCoefA;
-    pCoefBb = pCoefB;
-
-    pIn1 = &pSrc[0];
-
-    i = fftLen;
-    i = i / 4;
-
-    while (i > 0U) {
-        q15x8_t         in1 = vld1q_s16(pIn1);
-        q15x8_t         in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset);
-        q15x8_t         coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
-        q15x8_t         coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
-
-        /* can we avoid the conjugate here ? */
-        q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA, q15x8_t),
-                                         vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB, q15x8_t)));
-
-        vst1q_s16(pDst, out);
-        pDst += 8;
-
-        offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
-        offset -= 8;
-
-        pIn1 += 8;
-        i -= 1;
-    }
-}
-#else
-void arm_split_rifft_q15(
-        q15_t * pSrc,
-        uint32_t fftLen,
-  const q15_t * pATable,
-  const q15_t * pBTable,
-        q15_t * pDst,
-        uint32_t modifier)
-{
-        uint32_t i;                                    /* Loop Counter */
-        q31_t outR, outI;                              /* Temporary variables for output */
-  const q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
-        q15_t *pSrc1, *pSrc2;
-        q15_t *pDst1 = &pDst[0];
-
-  pCoefA = &pATable[0];
-  pCoefB = &pBTable[0];
-
-  pSrc1 = &pSrc[0];
-  pSrc2 = &pSrc[2 * fftLen];
-
-  i = fftLen;
-  while (i > 0U)
-  {
-      /*
-        outR = (  pIn[2 * i]             * pATable[2 * i]
-                + pIn[2 * i + 1]         * pATable[2 * i + 1]
-                + pIn[2 * n - 2 * i]     * pBTable[2 * i]
-                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-
-        outI = (  pIn[2 * i + 1]         * pATable[2 * i]
-                - pIn[2 * i]             * pATable[2 * i + 1]
-                - pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
-                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
-       */
-
-#if defined (ARM_MATH_DSP)
-
-#ifndef ARM_MATH_BIG_ENDIAN
-      /* pIn[2 * n - 2 * i] * pBTable[2 * i] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
-      outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB));
-#else
-      /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */
-      outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)));
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-      /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] + pIn[2 * n - 2 * i] * pBTable[2 * i] */
-      outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U;
-
-      /* -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
-      outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
-
-      /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
-#ifndef ARM_MATH_BIG_ENDIAN
-      outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI);
-#else
-      outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-      /* write output */
-#ifndef ARM_MATH_BIG_ENDIAN
-      write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16));
-#else
-      write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16));
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-
-#else  /* #if defined (ARM_MATH_DSP) */
-
-      outR = *pSrc2 * *pCoefB;
-      outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
-      outR = outR + (*pSrc1 * *pCoefA);
-      outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
-
-      outI = *(pSrc1 + 1) * *pCoefA;
-      outI = outI - (*pSrc1 * *(pCoefA + 1));
-      outI = outI - (*pSrc2 * *(pCoefB + 1));
-      outI = outI - (*(pSrc2 + 1) * *(pCoefB));
-
-      /* update input pointers */
-      pSrc1 += 2U;
-      pSrc2 -= 2U;
-
-      /* write output */
-      *pDst1++ = (q15_t) outR;
-      *pDst1++ = (q15_t) (outI >> 16);
-
-#endif /* #if defined (ARM_MATH_DSP) */
-
-      /* update coefficient pointer */
-      pCoefB = pCoefB + (2 * modifier);
-      pCoefA = pCoefA + (2 * modifier);
-
-      i--;
-  }
-
-}
-#endif /* defined(ARM_MATH_MVEI) */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_q15.c
+ * Description:  RFFT & RIFFT Q15 process function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/* ----------------------------------------------------------------------
+ * Internal function prototypes (both functions are defined later in this file)
+ * -------------------------------------------------------------------- */
+/* Forward split stage: arm_rfft_q15() calls it on pSrc after the complex FFT. */
+void arm_split_rfft_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pATable,
+  const q15_t * pBTable,
+        q15_t * pDst,
+        uint32_t modifier);
+/* Inverse split stage: arm_rfft_q15() calls it on pSrc before the complex IFFT. */
+void arm_split_rifft_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pATable,
+  const q15_t * pBTable,
+        q15_t * pDst,
+        uint32_t modifier);
+
+/**
+  @addtogroup RealFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q15 RFFT/RIFFT.
+  @param[in]     S     points to an instance of the Q15 RFFT/RIFFT structure
+  @param[in]     pSrc  points to input buffer (modified by the forward RFFT; the RIFFT reads fftLenReal + 2 elements from it)
+  @param[out]    pDst  points to output buffer (the forward RFFT writes 2 * fftLenReal elements into it)
+  @return        none
+
+  @par           Input and output formats
+                   Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+                   Hence the output format is different for different RFFT sizes.
+                   The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
+  @par
+                   \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
+  @par
+                   \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
+ */
+
+void arm_rfft_q15(
+  const arm_rfft_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst)
+{
+  const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
+        uint32_t L2 = S->fftLenReal >> 1U;             /* Length handed to the split stages and matching the complex FFT */
+        uint32_t i;                                    /* Loop counter for the RIFFT upscale */
+
+  /* Calculation of RIFFT of input */
+  if (S->ifftFlagR == 1U)
+  {
+     /*  Real IFFT core process */
+     arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+
+     /* Complex IFFT process */
+     arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
+     /* Upscale by 2 with saturation; widen to q31_t and multiply because << on a negative value is undefined */
+     for(i = 0; i < S->fftLenReal; i++)
+     {
+        pDst[i] = (q15_t) __SSAT(((q31_t) pDst[i] * 2), 16);
+     }
+  }
+  else
+  {
+     /* Calculation of RFFT of input */
+
+     /* Complex FFT process */
+     arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+
+     /*  Real FFT core process */
+     arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+  }
+
+}
+
+/**
+  @} end of RealFFT group
+ */
+
+/**
+  @brief         Core Real FFT process
+  @param[in]     pSrc      points to input buffer; elements 0 .. 2 * fftLen - 1 are read
+  @param[in]     fftLen    length of FFT (arm_rfft_q15() passes fftLenReal / 2)
+  @param[in]     pATable   points to twiddle Coef A buffer; read from element 2 * modifier in steps of 2 * modifier
+  @param[in]     pBTable   points to twiddle Coef B buffer; read from element 2 * modifier in steps of 2 * modifier
+  @param[out]    pDst      points to output buffer; elements 0 .. 4 * fftLen - 1 are written
+  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+
+  @par           Real FFT split stage
+                   Bins 1 .. fftLen - 1 are computed from pSrc and the twiddle tables and also mirrored, with negated imaginary part, into the upper half of pDst; bins 0 and fftLen come from pSrc[0] and pSrc[1].
+ */
+
+void arm_split_rfft_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pATable,
+  const q15_t * pBTable,
+        q15_t * pDst,
+        uint32_t modifier)
+{       
+        uint32_t i;                                    /* Loop Counter */
+        q31_t outR, outI;                              /* Temporary variables for output */
+  const q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
+        q15_t *pSrc1, *pSrc2;                          /* Input pointers: pSrc1 walks up from bin 1, pSrc2 walks down from bin fftLen - 1 */
+#if defined (ARM_MATH_DSP)
+        q15_t *pD1, *pD2;                              /* Output pointers: pD1 walks up from bin 1, pD2 walks down from the last bin */
+#endif
+
+  /* Init coefficient pointers at the entry for bin 1; bin 0 is produced separately after the loop */
+  pCoefA = &pATable[modifier * 2];
+  pCoefB = &pBTable[modifier * 2];
+
+  pSrc1 = &pSrc[2];
+  pSrc2 = &pSrc[(2U * fftLen) - 2U];
+
+#if defined (ARM_MATH_DSP)
+
+    i = 1U;                                            /* NOTE(review): redundant, overwritten by the for-loop initialiser below */
+    pD1 = pDst + 2;
+    pD2 = pDst + (4U * fftLen) - 2;
+
+    for (i = fftLen - 1; i > 0; i--)
+    {
+        /* Formula per output bin i (n is fftLen, pIn is pSrc; the table index is additionally scaled by modifier):
+          outR = (  pSrc[2 * i]             * pATable[2 * i]
+                  - pSrc[2 * i + 1]         * pATable[2 * i + 1]
+                  + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
+                  + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+          outI = (  pIn[2 * i + 1]         * pATable[2 * i]
+                  + pIn[2 * i]             * pATable[2 * i + 1]
+                  + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
+                  - pIn[2 * n - 2 * i + 1] * pBTable[2 * i])
+         */
+
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        /* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */
+        outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA));
+#else
+        /* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */
+        outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        /* pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
+        outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U;
+
+        /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+#ifndef ARM_MATH_BIG_ENDIAN
+        outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
+#else
+        outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+        /* NOTE(review): read_q15x2_da / read_q15x2_ia presumably also step pSrc2 back / pSrc1 forward by one pair - confirm in arm_math.h */
+        /* (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] */
+        outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI);
+
+        /* write output */
+        *pD1++ = (q15_t) outR;
+        *pD1++ = outI >> 16U;
+
+        /* write complex conjugate output */
+        pD2[0] = (q15_t) outR;
+        pD2[1] = -(outI >> 16U);
+        pD2 -= 2;
+
+        /* update coefficient pointer */
+        pCoefB = pCoefB + (2U * modifier);
+        pCoefA = pCoefA + (2U * modifier);
+    }
+    /* Bins fftLen and 0 are not produced by the loop: halved difference / sum of pSrc[0] and pSrc[1], imaginary part zero */
+    pDst[2U * fftLen]      = (pSrc[0] - pSrc[1]) >> 1U;
+    pDst[2U * fftLen + 1U] = 0;
+
+    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
+    pDst[1] = 0;
+
+#else
+
+    i = 1U;
+
+    while (i < fftLen)
+    {
+        /* Real part of output bin i (n is fftLen; the table index is additionally scaled by modifier):
+          outR = (  pSrc[2 * i]             * pATable[2 * i]
+                  - pSrc[2 * i + 1]         * pATable[2 * i + 1]
+                  + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
+                  + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+        */
+
+        outR = *pSrc1 * *pCoefA;
+        outR = outR - (*(pSrc1 + 1) * *(pCoefA + 1));
+        outR = outR + (*pSrc2 * *pCoefB);
+        outR = (outR + (*(pSrc2 + 1) * *(pCoefB + 1))) >> 16;
+
+        /* Imaginary part of output bin i (pIn is pSrc); the >> 16 is applied when the value is stored below:
+          outI = (  pIn[2 * i + 1]         * pATable[2 * i]
+                  + pIn[2 * i]             * pATable[2 * i + 1]
+                  + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
+                  - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+        */
+
+        outI = *pSrc2 * *(pCoefB + 1);
+        outI = outI - (*(pSrc2 + 1) * *pCoefB);
+        outI = outI + (*(pSrc1 + 1) * *pCoefA);
+        outI = outI + (*pSrc1 * *(pCoefA + 1));
+
+        /* update input pointers */
+        pSrc1 += 2U;
+        pSrc2 -= 2U;
+
+        /* write output */
+        pDst[2U * i] = (q15_t) outR;
+        pDst[2U * i + 1U] = outI >> 16U;
+
+        /* write complex conjugate output */
+        pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR;
+        pDst[((4U * fftLen) - (2U * i)) + 1U] = -(outI >> 16U);
+
+        /* update coefficient pointer */
+        pCoefB = pCoefB + (2U * modifier);
+        pCoefA = pCoefA + (2U * modifier);
+
+        i++;
+    }
+    /* Bins fftLen and 0 are not produced by the loop: halved difference / sum of pSrc[0] and pSrc[1], imaginary part zero */
+    pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
+    pDst[2U * fftLen + 1U] = 0;
+
+    pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
+    pDst[1] = 0;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+}
+
+
+/**
+  @brief         Core Real IFFT process
+  @param[in]     pSrc      points to input buffer; elements 0 .. 2 * fftLen + 1 are read, so it must hold 2 * fftLen + 2 values
+  @param[in]     fftLen    length of FFT (arm_rfft_q15() passes fftLenReal / 2)
+  @param[in]     pATable   points to twiddle Coef A buffer; read from element 0 in steps of 2 * modifier
+  @param[in]     pBTable   points to twiddle Coef B buffer; read from element 0 in steps of 2 * modifier
+  @param[out]    pDst      points to output buffer; elements 0 .. 2 * fftLen - 1 are written
+  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+
+  @par           Real IFFT split stage
+                   Each of the fftLen iterations combines one pair read upwards from pSrc[0] with one pair read downwards from pSrc[2 * fftLen] and stores one (outR, outI) pair in pDst.
+ */
+
+void arm_split_rifft_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pATable,
+  const q15_t * pBTable,
+        q15_t * pDst,
+        uint32_t modifier)
+{
+        uint32_t i;                                    /* Loop Counter */
+        q31_t outR, outI;                              /* Temporary variables for output */
+  const q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
+        q15_t *pSrc1, *pSrc2;                          /* Input pointers: pSrc1 walks up from pSrc[0], pSrc2 walks down from pSrc[2 * fftLen] */
+        q15_t *pDst1 = &pDst[0];                       /* Output write pointer */
+  /* Unlike the forward split stage, the tables are used from their first entry */
+  pCoefA = &pATable[0];
+  pCoefB = &pBTable[0];
+
+  pSrc1 = &pSrc[0];
+  pSrc2 = &pSrc[2 * fftLen];
+
+  i = fftLen;
+  while (i > 0U)
+  {
+      /* Formula per output sample (n is fftLen; the table index is additionally scaled by modifier):
+        outR = (  pIn[2 * i]             * pATable[2 * i]
+                + pIn[2 * i + 1]         * pATable[2 * i + 1]
+                + pIn[2 * n - 2 * i]     * pBTable[2 * i]
+                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+        outI = (  pIn[2 * i + 1]         * pATable[2 * i]
+                - pIn[2 * i]             * pATable[2 * i + 1]
+                - pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
+                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+       */
+
+#if defined (ARM_MATH_DSP)
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      /* pIn[2 * n - 2 * i] * pBTable[2 * i] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
+      outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB));
+#else
+      /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */
+      outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+      /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] + pIn[2 * n - 2 * i] * pBTable[2 * i] */
+      outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U;
+
+      /* -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+      outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
+      /* NOTE(review): read_q15x2_da / read_q15x2_ia presumably also step pSrc2 back / pSrc1 forward by one pair - confirm in arm_math.h */
+      /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
+#ifndef ARM_MATH_BIG_ENDIAN
+      outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI);
+#else
+      outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+      /* write output: outR and the upper 16 bits of outI packed into one 32-bit store, halfword order chosen by endianness */
+#ifndef ARM_MATH_BIG_ENDIAN
+      write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16));
+#else
+      write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+
+#else  /* #if defined (ARM_MATH_DSP) */
+      /* Plain C evaluation of the formula above; outI is shifted right by 16 when it is stored */
+      outR = *pSrc2 * *pCoefB;
+      outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
+      outR = outR + (*pSrc1 * *pCoefA);
+      outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
+
+      outI = *(pSrc1 + 1) * *pCoefA;
+      outI = outI - (*pSrc1 * *(pCoefA + 1));
+      outI = outI - (*pSrc2 * *(pCoefB + 1));
+      outI = outI - (*(pSrc2 + 1) * *(pCoefB));
+
+      /* update input pointers */
+      pSrc1 += 2U;
+      pSrc2 -= 2U;
+
+      /* write output */
+      *pDst1++ = (q15_t) outR;
+      *pDst1++ = (q15_t) (outI >> 16);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+      /* update coefficient pointer */
+      pCoefB = pCoefB + (2 * modifier);
+      pCoefA = pCoefA + (2 * modifier);
+
+      i--;
+  }
+
+}
diff --git a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c
index ad3212d..0cc595b 100644
--- a/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c
+++ b/Drivers/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c
@@ -1,430 +1,292 @@
-/* ----------------------------------------------------------------------
- * Project:      CMSIS DSP Library
- * Title:        arm_rfft_q31.c
- * Description:  FFT & RIFFT Q31 process function
- *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
-/*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "dsp/transform_functions.h"
-
-/* ----------------------------------------------------------------------
- * Internal functions prototypes
- * -------------------------------------------------------------------- */
-
-void arm_split_rfft_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pATable,
-  const q31_t * pBTable,
-        q31_t * pDst,
-        uint32_t modifier);
-
-void arm_split_rifft_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pATable,
-  const q31_t * pBTable,
-        q31_t * pDst,
-        uint32_t modifier);
-
-/**
-  @addtogroup RealFFT
-  @{
- */
-
-/**
-  @brief         Processing function for the Q31 RFFT/RIFFT.
-  @param[in]     S     points to an instance of the Q31 RFFT/RIFFT structure
-  @param[in]     pSrc  points to input buffer (Source buffer is modified by this function)
-  @param[out]    pDst  points to output buffer
-  @return        none
-
-  @par           Input an output formats
-                   Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
-                   Hence the output format is different for different RFFT sizes.
-                   The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
-  @par
-                   \image html RFFTQ31.gif "Input and Output Formats for Q31 RFFT"
-  @par
-                   \image html RIFFTQ31.gif "Input and Output Formats for Q31 RIFFT"
-  @par
-                   If the input buffer is of length N, the output buffer must have length 2*N.
-                   The input buffer is modified by this function.
-  @par
-                   For the RIFFT, the source buffer must at least have length 
-                   fftLenReal + 2.
-                   The last two elements must be equal to what would be generated
-                   by the RFFT:
-                     (pSrc[0] - pSrc[1]) >> 1 and 0
-
- */
-
-void arm_rfft_q31(
-  const arm_rfft_instance_q31 * S,
-        q31_t * pSrc,
-        q31_t * pDst)
-{
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-  const arm_cfft_instance_q31 *S_CFFT = &(S->cfftInst);
-#else
-  const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
-#endif
-        uint32_t L2 = S->fftLenReal >> 1U;
-
-  /* Calculation of RIFFT of input */
-  if (S->ifftFlagR == 1U)
-  {
-     /*  Real IFFT core process */
-     arm_split_rifft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
-
-     /* Complex IFFT process */
-     arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
-
-     arm_shift_q31(pDst, 1, pDst, S->fftLenReal);
-  }
-  else
-  {
-     /* Calculation of RFFT of input */
-
-     /* Complex FFT process */
-     arm_cfft_q31 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
-
-     /*  Real FFT core process */
-     arm_split_rfft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
-  }
-
-}
-
-/**
-  @} end of RealFFT group
- */
-
-/**
-  @brief         Core Real FFT process
-  @param[in]     pSrc      points to input buffer
-  @param[in]     fftLen    length of FFT
-  @param[in]     pATable   points to twiddle Coef A buffer
-  @param[in]     pBTable   points to twiddle Coef B buffer
-  @param[out]    pDst      points to output buffer
-  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  @return        none
- */
-
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "arm_helium_utils.h"
-#include "arm_vec_fft.h"
-
-#if defined(__CMSIS_GCC_H)
-
-#define MVE_CMPLX_MULT_FX_AxB_S32(A,B)          vqdmladhxq_s32(vqdmlsdhq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B)
-#define MVE_CMPLX_MULT_FX_AxConjB_S32(A,B)      vqdmladhq_s32(vqdmlsdhxq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B)
-
-#endif 
-
-void arm_split_rfft_q31(
-    q31_t       *pSrc,
-    uint32_t     fftLen,
-    const q31_t       *pATable,
-    const q31_t       *pBTable,
-    q31_t       *pDst,
-    uint32_t     modifier)
-{
-    uint32_t        i;          /* Loop Counter */
-    const q31_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
-    q31_t          *pOut1 = &pDst[2];
-    q31_t          *pIn1 = &pSrc[2];
-    uint32x4_t      offset = { 2, 3, 0, 1 };
-    uint32x4_t      offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
-
-    offset = offset + (2 * fftLen - 4);
-
-
-    /* Init coefficient pointers */
-    pCoefA = &pATable[modifier * 2];
-    pCoefB = &pBTable[modifier * 2];
-
-    const q31_t    *pCoefAb, *pCoefBb;
-    pCoefAb = pCoefA;
-    pCoefBb = pCoefB;
-
-    pIn1 = &pSrc[2];
-
-    i = fftLen - 1U;
-    i = i / 2 + 1;
-    while (i > 0U) {
-        q31x4_t         in1 = vld1q_s32(pIn1);
-        q31x4_t         in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
-        q31x4_t         coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
-        q31x4_t         coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
-#if defined(__CMSIS_GCC_H)
-        q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB_S32(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB_S32(coefB, in2));
-#else
-        q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB(in1, coefA, q31x4_t),
-                                         MVE_CMPLX_MULT_FX_AxConjB(coefB, in2, q31x4_t));
-#endif
-        vst1q(pOut1, out);
-        pOut1 += 4;
-
-        offsetCoef += modifier * 4;
-        offset -= 4;
-
-        pIn1 += 4;
-        i -= 1;
-    }
-
-    pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
-    pDst[2 * fftLen + 1] = 0;
-
-    pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
-    pDst[1] = 0;
-}
-#else
-void arm_split_rfft_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pATable,
-  const q31_t * pBTable,
-        q31_t * pDst,
-        uint32_t modifier)
-{
-        uint32_t i;                                    /* Loop Counter */
-        q31_t outR, outI;                              /* Temporary variables for output */
-  const q31_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
-        q31_t CoefA1, CoefA2, CoefB1;                  /* Temporary variables for twiddle coefficients */
-        q31_t *pOut1 = &pDst[2], *pOut2 = &pDst[4 * fftLen - 1];
-        q31_t *pIn1 =  &pSrc[2], *pIn2 =  &pSrc[2 * fftLen - 1];
-
-  /* Init coefficient pointers */
-  pCoefA = &pATable[modifier * 2];
-  pCoefB = &pBTable[modifier * 2];
-
-  i = fftLen - 1U;
-
-  while (i > 0U)
-  {
-     /*
-       outR = (  pSrc[2 * i]             * pATable[2 * i]
-               - pSrc[2 * i + 1]         * pATable[2 * i + 1]
-               + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
-               + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-
-       outI = (  pIn[2 * i + 1]         * pATable[2 * i]
-               + pIn[2 * i]             * pATable[2 * i + 1]
-               + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
-               - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
-      */
-
-     CoefA1 = *pCoefA++;
-     CoefA2 = *pCoefA;
-
-     /* outR = (pSrc[2 * i] * pATable[2 * i] */
-     mult_32x32_keep32_R (outR, *pIn1, CoefA1);
-
-     /* outI = pIn[2 * i] * pATable[2 * i + 1] */
-     mult_32x32_keep32_R (outI, *pIn1++, CoefA2);
-
-     /* - pSrc[2 * i + 1] * pATable[2 * i + 1] */
-     multSub_32x32_keep32_R (outR, *pIn1, CoefA2);
-
-     /* (pIn[2 * i + 1] * pATable[2 * i] */
-     multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
-
-     /* pSrc[2 * n - 2 * i] * pBTable[2 * i]  */
-     multSub_32x32_keep32_R (outR, *pIn2, CoefA2);
-     CoefB1 = *pCoefB;
-
-     /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
-     multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
-
-     /* pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
-     multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
-
-     /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
-     multSub_32x32_keep32_R (outI, *pIn2--, CoefA2);
-
-     /* write output */
-     *pOut1++ = outR;
-     *pOut1++ = outI;
-
-     /* write complex conjugate output */
-     *pOut2-- = -outI;
-     *pOut2-- = outR;
-
-     /* update coefficient pointer */
-     pCoefB = pCoefB + (2 * modifier);
-     pCoefA = pCoefA + (2 * modifier - 1);
-
-     /* Decrement loop count */
-     i--;
-  }
-
-  pDst[2 * fftLen]     = (pSrc[0] - pSrc[1]) >> 1U;
-  pDst[2 * fftLen + 1] = 0;
-
-  pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
-  pDst[1] = 0;
-}
-#endif /* defined(ARM_MATH_MVEI) */
-
-/**
-  @brief         Core Real IFFT process
-  @param[in]     pSrc      points to input buffer
-  @param[in]     fftLen    length of FFT
-  @param[in]     pATable   points to twiddle Coef A buffer
-  @param[in]     pBTable   points to twiddle Coef B buffer
-  @param[out]    pDst      points to output buffer
-  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
-  @return        none
- */
-
-#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-void arm_split_rifft_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pATable,
-  const q31_t * pBTable,
-        q31_t * pDst,
-        uint32_t modifier)
-{
-    uint32_t        i;          /* Loop Counter */
-    const q31_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
-    q31_t          *pIn1;
-    uint32x4_t      offset = { 2, 3, 0, 1 };
-    uint32x4_t      offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
-    int32x4_t       conj = { 1, -1, 1, -1 };
-
-    offset = offset + (2 * fftLen - 2);
-
-    /* Init coefficient pointers */
-    pCoefA = &pATable[0];
-    pCoefB = &pBTable[0];
-
-    const q31_t    *pCoefAb, *pCoefBb;
-    pCoefAb = pCoefA;
-    pCoefBb = pCoefB;
-
-    pIn1 = &pSrc[0];
-
-    i = fftLen;
-    i = i >> 1;
-    while (i > 0U) {
-        q31x4_t         in1 = vld1q_s32(pIn1);
-        q31x4_t         in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
-        q31x4_t         coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
-        q31x4_t         coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
-
-        /* can we avoid the conjugate here ? */
-#if defined(__CMSIS_GCC_H)
-        q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB_S32(in1, coefA),
-                                     vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB_S32(in2, coefB)));
-#else
-        q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA, q31x4_t),
-                                         vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB, q31x4_t)));
-#endif
-        vst1q_s32(pDst, out);
-        pDst += 4;
-
-        offsetCoef += modifier * 4;
-        offset -= 4;
-
-        pIn1 += 4;
-        i -= 1;
-    }
-}
-#else
-void arm_split_rifft_q31(
-        q31_t * pSrc,
-        uint32_t fftLen,
-  const q31_t * pATable,
-  const q31_t * pBTable,
-        q31_t * pDst,
-        uint32_t modifier)
-{       
-        q31_t outR, outI;                              /* Temporary variables for output */
-  const q31_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
-        q31_t CoefA1, CoefA2, CoefB1;                  /* Temporary variables for twiddle coefficients */
-        q31_t *pIn1 = &pSrc[0], *pIn2 = &pSrc[2 * fftLen + 1];
-
-  pCoefA = &pATable[0];
-  pCoefB = &pBTable[0];
-
-  while (fftLen > 0U)
-  {
-     /*
-       outR = (  pIn[2 * i]             * pATable[2 * i]
-               + pIn[2 * i + 1]         * pATable[2 * i + 1]
-               + pIn[2 * n - 2 * i]     * pBTable[2 * i]
-               - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
-
-       outI = (  pIn[2 * i + 1]         * pATable[2 * i]
-               - pIn[2 * i]             * pATable[2 * i + 1]
-               - pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
-               - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
-      */
-
-     CoefA1 = *pCoefA++;
-     CoefA2 = *pCoefA;
-
-     /* outR = (pIn[2 * i] * pATable[2 * i] */
-     mult_32x32_keep32_R (outR, *pIn1, CoefA1);
-
-     /* - pIn[2 * i] * pATable[2 * i + 1] */
-     mult_32x32_keep32_R (outI, *pIn1++, -CoefA2);
-
-     /* pIn[2 * i + 1] * pATable[2 * i + 1] */
-     multAcc_32x32_keep32_R (outR, *pIn1, CoefA2);
-
-     /* pIn[2 * i + 1] * pATable[2 * i] */
-     multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
-
-     /* pIn[2 * n - 2 * i] * pBTable[2 * i] */
-     multAcc_32x32_keep32_R (outR, *pIn2, CoefA2);
-     CoefB1 = *pCoefB;
-
-     /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
-     multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
-
-     /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
-     multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
-
-     /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
-     multAcc_32x32_keep32_R (outI, *pIn2--, CoefA2);
-
-     /* write output */
-     *pDst++ = outR;
-     *pDst++ = outI;
-
-     /* update coefficient pointer */
-     pCoefB = pCoefB + (modifier * 2);
-     pCoefA = pCoefA + (modifier * 2 - 1);
-
-     /* Decrement loop count */
-     fftLen--;
-  }
-
-}
-
-#endif /* defined(ARM_MATH_MVEI) */
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rfft_q31.c
+ * Description:  FFT & RIFFT Q31 process function
+ *
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/* ----------------------------------------------------------------------
+ * Internal function prototypes (split stages called by arm_rfft_q31)
+ * -------------------------------------------------------------------- */
+
+void arm_split_rfft_q31(                  /* forward path: arm_rfft_q31 calls it after the complex FFT */
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pATable,
+  const q31_t * pBTable,
+        q31_t * pDst,
+        uint32_t modifier);
+
+void arm_split_rifft_q31(                 /* inverse path: arm_rfft_q31 calls it before the complex FFT */
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pATable,
+  const q31_t * pBTable,
+        q31_t * pDst,
+        uint32_t modifier);
+
+/**
+  @addtogroup RealFFT
+  @{
+ */
+
+/**
+  @brief         Processing function for the Q31 RFFT/RIFFT.
+  @param[in]     S     points to an instance of the Q31 RFFT/RIFFT structure
+  @param[in,out] pSrc  points to input buffer (the forward transform runs the complex FFT in place on it; the inverse transform reads fftLenReal + 2 values from it)
+  @param[out]    pDst  points to output buffer (the forward transform writes 2 * fftLenReal values)
+  @return        none
+
+  @par           Input and output formats
+                   Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+                   Hence the output format is different for different RFFT sizes.
+                   The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
+  @par
+                   \image html RFFTQ31.gif "Input and Output Formats for Q31 RFFT"
+  @par
+                   \image html RIFFTQ31.gif "Input and Output Formats for Q31 RIFFT"
+ */
+
+void arm_rfft_q31(
+  const arm_rfft_instance_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst)
+{
+  const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
+        uint32_t L2 = S->fftLenReal >> 1U;             /* complex FFT length: half the real length */
+
+  /* Calculation of RIFFT of input */
+  if (S->ifftFlagR == 1U)
+  {
+     /*  Real IFFT core process */
+     arm_split_rifft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+
+     /* Complex IFFT process */
+     arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
+
+     /* Scale the fftLenReal results up by one bit with arm_shift_q31(), which
+        saturates to the Q31 range. A plain "pDst[i] << 1U" left-shifts a signed
+        value: that is undefined behaviour for negative samples and wraps
+        silently when the doubled magnitude does not fit in 32 bits. */
+     arm_shift_q31 (pDst, 1, pDst, S->fftLenReal);
+  }
+  else
+  {
+     /* Calculation of RFFT of input */
+
+     /* Complex FFT process */
+     arm_cfft_q31 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+
+     /*  Real FFT core process */
+     arm_split_rfft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+  }
+
+}
+
+/**
+  @} end of RealFFT group
+ */
+
+/**
+  @brief         Core Real FFT process
+  @param[in]     pSrc      points to input buffer (2 * fftLen values are read)
+  @param[in]     fftLen    length of FFT (number of complex bins held in pSrc; must be at least 1)
+  @param[in]     pATable   points to twiddle Coef A buffer
+  @param[in]     pBTable   points to twiddle Coef B buffer
+  @param[out]    pDst      points to output buffer (4 * fftLen values are written)
+  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+ */
+
+void arm_split_rfft_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pATable,
+  const q31_t * pBTable,
+        q31_t * pDst,
+        uint32_t modifier)
+{
+        uint32_t i;                                    /* Loop Counter */
+        q31_t outR, outI;                              /* Temporary variables for output */
+  const q31_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
+        q31_t CoefA1, CoefA2, CoefB1;                  /* Temporary variables for twiddle coefficients */
+        q31_t *pOut1 = &pDst[2], *pOut2 = &pDst[4 * fftLen - 1];   /* pOut1 walks up from bin 1, pOut2 walks down from the last output value */
+        q31_t *pIn1 =  &pSrc[2], *pIn2 =  &pSrc[2 * fftLen - 1];   /* pIn1 walks up from bin 1, pIn2 walks down from the last input value */
+
+  /* Start both coefficient pointers at the table entry used for bin 1 */
+  pCoefA = &pATable[modifier * 2];
+  pCoefB = &pBTable[modifier * 2];
+
+  i = fftLen - 1U;                                     /* bins 1 .. fftLen-1; wraps if fftLen is 0 */
+
+  while (i > 0U)
+  {
+     /*
+       Per bin k = fftLen - i, with n = fftLen, A = &pATable[2*modifier*k], B = &pBTable[2*modifier*k]
+       (reading mult/multAcc/multSub_32x32_keep32_R as assign/add/subtract of the product;
+        the macros are not defined in this file - confirm their scaling at their definition):
+
+       outR =   pSrc[2k]         * A[0]  -  pSrc[2k + 1]   * A[1]
+              - pSrc[2(n-k) + 1] * A[1]  +  pSrc[2(n-k)]   * B[0]
+       outI =   pSrc[2k]         * A[1]  +  pSrc[2k + 1]   * A[0]
+              - pSrc[2(n-k) + 1] * B[0]  -  pSrc[2(n-k)]   * A[1]
+       B[1] is never read; NOTE(review): presumably the tables satisfy B[1] == -A[1] - confirm.
+      */
+
+     CoefA1 = *pCoefA++;
+     CoefA2 = *pCoefA;
+
+     /* outR  = pSrc[2k] * A[0] */
+     mult_32x32_keep32_R (outR, *pIn1, CoefA1);
+
+     /* outI  = pSrc[2k] * A[1]; pIn1 then steps to pSrc[2k + 1] */
+     mult_32x32_keep32_R (outI, *pIn1++, CoefA2);
+
+     /* outR -= pSrc[2k + 1] * A[1] */
+     multSub_32x32_keep32_R (outR, *pIn1, CoefA2);
+
+     /* outI += pSrc[2k + 1] * A[0]; pIn1 then steps to the next bin */
+     multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
+
+     /* outR -= pSrc[2(n-k) + 1] * A[1] */
+     multSub_32x32_keep32_R (outR, *pIn2, CoefA2);
+     CoefB1 = *pCoefB;
+
+     /* outI -= pSrc[2(n-k) + 1] * B[0]; pIn2 then steps down to pSrc[2(n-k)] */
+     multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
+
+     /* outR += pSrc[2(n-k)] * B[0] */
+     multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
+
+     /* outI -= pSrc[2(n-k)] * A[1]; pIn2 then steps down to the previous bin */
+     multSub_32x32_keep32_R (outI, *pIn2--, CoefA2);
+
+     /* store bin k at pDst[2k], pDst[2k + 1] */
+     *pOut1++ = outR;
+     *pOut1++ = outI;
+
+     /* store (outR, -outI) at pDst[2(2n-k)], pDst[2(2n-k) + 1], filling the upper half downwards */
+     *pOut2-- = -outI;
+     *pOut2-- = outR;
+
+     /* advance both tables by 2 * modifier entries (pCoefA was already bumped once above) */
+     pCoefB = pCoefB + (2 * modifier);
+     pCoefA = pCoefA + (2 * modifier - 1);
+
+     /* Decrement loop count */
+     i--;
+  }
+
+  pDst[2 * fftLen]     = (pSrc[0] - pSrc[1]) >> 1U;    /* bin n, not produced by the loop */
+  pDst[2 * fftLen + 1] = 0;
+
+  pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;                 /* bin 0, not produced by the loop */
+  pDst[1] = 0;
+}
+
+
+/**
+  @brief         Core Real IFFT process
+  @param[in]     pSrc      points to input buffer (2 * fftLen + 2 values are read)
+  @param[in]     fftLen    length of FFT (number of complex outputs; also consumed as the loop counter)
+  @param[in]     pATable   points to twiddle Coef A buffer
+  @param[in]     pBTable   points to twiddle Coef B buffer
+  @param[out]    pDst      points to output buffer (2 * fftLen values are written)
+  @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+  @return        none
+ */
+
+void arm_split_rifft_q31(
+        q31_t * pSrc,
+        uint32_t fftLen,
+  const q31_t * pATable,
+  const q31_t * pBTable,
+        q31_t * pDst,
+        uint32_t modifier)
+{       
+        q31_t outR, outI;                              /* Temporary variables for output */
+  const q31_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
+        q31_t CoefA1, CoefA2, CoefB1;                  /* Temporary variables for twiddle coefficients */
+        q31_t *pIn1 = &pSrc[0], *pIn2 = &pSrc[2 * fftLen + 1];   /* pIn1 walks up from bin 0, pIn2 walks down from pSrc[2 * fftLen + 1] */
+
+  pCoefA = &pATable[0];
+  pCoefB = &pBTable[0];
+
+  while (fftLen > 0U)
+  {
+     /*
+       Per output k = 0 .. n-1 (n = fftLen on entry), A = &pATable[2*modifier*k], B = &pBTable[2*modifier*k]
+       (reading mult/multAcc/multSub_32x32_keep32_R as assign/add/subtract of the product;
+        the macros are not defined in this file - confirm their scaling at their definition):
+
+       outR =   pSrc[2k]         * A[0]  +  pSrc[2k + 1]   * A[1]
+              + pSrc[2(n-k) + 1] * A[1]  +  pSrc[2(n-k)]   * B[0]
+       outI = - pSrc[2k]         * A[1]  +  pSrc[2k + 1]   * A[0]
+              - pSrc[2(n-k) + 1] * B[0]  +  pSrc[2(n-k)]   * A[1]
+       B[1] is never read; NOTE(review): presumably the tables satisfy B[1] == -A[1] - confirm.
+      */
+
+     CoefA1 = *pCoefA++;
+     CoefA2 = *pCoefA;
+
+     /* outR  = pSrc[2k] * A[0] */
+     mult_32x32_keep32_R (outR, *pIn1, CoefA1);
+
+     /* outI  = pSrc[2k] * (-A[1]); pIn1 then steps to pSrc[2k + 1] */
+     mult_32x32_keep32_R (outI, *pIn1++, -CoefA2);
+
+     /* outR += pSrc[2k + 1] * A[1] */
+     multAcc_32x32_keep32_R (outR, *pIn1, CoefA2);
+
+     /* outI += pSrc[2k + 1] * A[0]; pIn1 then steps to the next bin */
+     multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
+
+     /* outR += pSrc[2(n-k) + 1] * A[1] */
+     multAcc_32x32_keep32_R (outR, *pIn2, CoefA2);
+     CoefB1 = *pCoefB;
+
+     /* outI -= pSrc[2(n-k) + 1] * B[0]; pIn2 then steps down to pSrc[2(n-k)] */
+     multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
+
+     /* outR += pSrc[2(n-k)] * B[0] */
+     multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
+
+     /* outI += pSrc[2(n-k)] * A[1]; pIn2 then steps down to the previous bin */
+     multAcc_32x32_keep32_R (outI, *pIn2--, CoefA2);
+
+     /* store output k at pDst[2k], pDst[2k + 1] */
+     *pDst++ = outR;
+     *pDst++ = outI;
+
+     /* advance both tables by 2 * modifier entries (pCoefA was already bumped once above) */
+     pCoefB = pCoefB + (modifier * 2);
+     pCoefA = pCoefA + (modifier * 2 - 1);
+
+     /* Decrement loop count */
+     fftLen--;
+  }
+
+}