diff options
Diffstat (limited to 'Drivers/CMSIS/NN')
102 files changed, 21721 insertions, 0 deletions
diff --git a/Drivers/CMSIS/NN/CMakeLists.txt b/Drivers/CMSIS/NN/CMakeLists.txt new file mode 100644 index 0000000..ad3b748 --- /dev/null +++ b/Drivers/CMSIS/NN/CMakeLists.txt @@ -0,0 +1,29 @@ +# +# Copyright (c) 2019-2021 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +cmake_minimum_required(VERSION 3.15.6) + +project(CMSISNN) + +set(CMSIS_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..") + +option(BUILD_CMSIS_NN_FUNCTIONS "Build CMSIS-NN Source." ON) + +if(BUILD_CMSIS_NN_FUNCTIONS) + add_subdirectory(Source) +endif() diff --git a/Drivers/CMSIS/NN/Include/arm_nn_math_types.h b/Drivers/CMSIS/NN/Include/arm_nn_math_types.h new file mode 100644 index 0000000..390fe78 --- /dev/null +++ b/Drivers/CMSIS/NN/Include/arm_nn_math_types.h @@ -0,0 +1,169 @@ +/****************************************************************************** + * @file arm_nn_math_types.h + * @brief Compiler include and basic types + * @version V1.1.0 + * @date 09 March 2022 + * Target Processor: Cortex-M + ******************************************************************************/ +/* + * Copyright (c) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + Copied from CMSIS/DSP/arm_math_types.h and modified +*/ + +#ifndef _ARM_NN_MATH_TYPES_H_ + +#define _ARM_NN_MATH_TYPES_H_ + +/* DSP inlcude for enum arm_status. */ +#include "arm_math_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Compiler specific diagnostic adjustment */ +#if defined(__CC_ARM) + +#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + +#elif defined(__GNUC__) + +#elif defined(__ICCARM__) + +#elif defined(__TI_ARM__) + +#elif defined(__CSMC__) + +#elif defined(__TASKING__) + +#elif defined(_MSC_VER) + +#else +#error Unknown compiler +#endif + +/* Included for instrinsics definitions */ +#if defined(_MSC_VER) +#include <stdint.h> +#ifndef __STATIC_FORCEINLINE +#define __STATIC_FORCEINLINE static __forceinline +#endif +#ifndef __STATIC_INLINE +#define __STATIC_INLINE static __inline +#endif +#ifndef __ALIGNED +#define __ALIGNED(x) __declspec(align(x)) +#endif + +#elif defined(__GNUC_PYTHON__) +#include <stdint.h> +#ifndef __ALIGNED +#define __ALIGNED(x) __attribute__((aligned(x))) +#endif +#ifndef __STATIC_FORCEINLINE +#define __STATIC_FORCEINLINE static inline __attribute__((always_inline)) +#endif +#ifndef __STATIC_INLINE +#define __STATIC_INLINE static inline +#endif + +#else +#include "cmsis_compiler.h" +#endif + +#include <float.h> +#include <limits.h> +#include <math.h> +#include <string.h> + +/* evaluate ARM DSP feature */ +#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) +#ifndef ARM_MATH_DSP +#define ARM_MATH_DSP 1 +#endif +#endif + +#if __ARM_FEATURE_MVE +#ifndef ARM_MATH_MVEI +#define ARM_MATH_MVEI +#endif +#endif + +/* Compiler specific diagnostic adjustment */ +#if defined(__CC_ARM) + +#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + +#elif defined(__GNUC__) +// #pragma GCC diagnostic pop + +#elif defined(__ICCARM__) + +#elif defined(__TI_ARM__) + +#elif defined(__CSMC__) + +#elif defined(__TASKING__) + +#elif defined(_MSC_VER) + +#else +#error Unknown compiler +#endif + +#ifdef __cplusplus +} +#endif + +#if __ARM_FEATURE_MVE +#include <arm_mve.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Add necessary typedefs + */ + +#define NN_Q31_MAX ((q31_t)(0x7FFFFFFFL)) +#define NN_Q15_MAX ((q15_t)(0x7FFF)) +#define NN_Q7_MAX ((q7_t)(0x7F)) +#define NN_Q31_MIN ((q31_t)(0x80000000L)) +#define NN_Q15_MIN ((q15_t)(0x8000)) +#define NN_Q7_MIN ((q7_t)(0x80)) + +/** + * @brief Error status returned by some functions in the library. + */ + +typedef enum +{ + ARM_CMSIS_NN_SUCCESS = 0, /**< No error */ + ARM_CMSIS_NN_ARG_ERROR = -1, /**< One or more arguments are incorrect */ + ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */ +} arm_cmsis_nn_status; + +#ifdef __cplusplus +} +#endif + +#endif /*ifndef _ARM_NN_MATH_TYPES_H_ */ diff --git a/Drivers/CMSIS/NN/Include/arm_nn_tables.h b/Drivers/CMSIS/NN/Include/arm_nn_tables.h new file mode 100644 index 0000000..327294d --- /dev/null +++ b/Drivers/CMSIS/NN/Include/arm_nn_tables.h @@ -0,0 +1,56 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_tables.h + * Description: Extern declaration for NN tables + * + * $Date: 17. August 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ARM_NN_TABLES_H +#define _ARM_NN_TABLES_H + +#include "arm_nn_math_types.h" + +/** + * @brief tables for various activation functions + * + */ + +extern const q15_t sigmoidTable_q15[256]; +extern const q7_t sigmoidTable_q7[256]; + +extern const q7_t tanhTable_q7[256]; +extern const q15_t tanhTable_q15[256]; + +/** + * @brief 2-way tables for various activation functions + * + * 2-way table, H table for value larger than 1/4 + * L table for value smaller than 1/4, H table for remaining + * We have this only for the q15_t version. It does not make + * sense to have it for q7_t type + */ +extern const q15_t sigmoidHTable_q15[192]; +extern const q15_t sigmoidLTable_q15[128]; + +#endif /* ARM_NN_TABLES_H */ diff --git a/Drivers/CMSIS/NN/Include/arm_nn_types.h b/Drivers/CMSIS/NN/Include/arm_nn_types.h new file mode 100644 index 0000000..6040d72 --- /dev/null +++ b/Drivers/CMSIS/NN/Include/arm_nn_types.h @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_types.h + * Description: Public header file to contain the CMSIS-NN structs for the + * TensorFlowLite micro compliant functions + * + * $Date: 22. Februari 2022 + * $Revision: V.2.1.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#ifndef _ARM_NN_TYPES_H +#define _ARM_NN_TYPES_H + +#include <stdint.h> + +/** CMSIS-NN object to contain the width and height of a tile */ +typedef struct +{ + int32_t w; /**< Width */ + int32_t h; /**< Height */ +} cmsis_nn_tile; + +/** CMSIS-NN object used for the function context. */ +typedef struct +{ + void *buf; /**< Pointer to a buffer needed for the optimization */ + int32_t size; /**< Buffer size */ +} cmsis_nn_context; + +/** CMSIS-NN object to contain the dimensions of the tensors */ +typedef struct +{ + int32_t n; /**< Generic dimension to contain either the batch size or output channels. + Please refer to the function documentation for more information */ + int32_t h; /**< Height */ + int32_t w; /**< Width */ + int32_t c; /**< Input channels */ +} cmsis_nn_dims; + +/** CMSIS-NN object for the per-channel quantization parameters */ +typedef struct +{ + int32_t *multiplier; /**< Multiplier values */ + int32_t *shift; /**< Shift values */ +} cmsis_nn_per_channel_quant_params; + +/** CMSIS-NN object for the per-tensor quantization parameters */ +typedef struct +{ + int32_t multiplier; /**< Multiplier value */ + int32_t shift; /**< Shift value */ +} cmsis_nn_per_tensor_quant_params; + +/** CMSIS-NN object for the quantized Relu activation */ +typedef struct +{ + int32_t min; /**< Min value used to clamp the result */ + int32_t max; /**< Max value used to clamp the result */ +} cmsis_nn_activation; + +/** CMSIS-NN object for the convolution layer parameters */ +typedef struct +{ + int32_t input_offset; /**< Zero value for the input tensor */ + int32_t output_offset; /**< Zero value for the output tensor */ + cmsis_nn_tile stride; + cmsis_nn_tile padding; + cmsis_nn_tile dilation; + cmsis_nn_activation activation; +} cmsis_nn_conv_params; + +/** CMSIS-NN object for Depthwise convolution layer parameters */ +typedef struct +{ + int32_t input_offset; /**< Zero value for the input tensor */ + int32_t output_offset; /**< Zero value for the output tensor */ + int32_t ch_mult; /**< Channel Multiplier. ch_mult * in_ch = out_ch */ + cmsis_nn_tile stride; + cmsis_nn_tile padding; + cmsis_nn_tile dilation; + cmsis_nn_activation activation; +} cmsis_nn_dw_conv_params; +/** CMSIS-NN object for pooling layer parameters */ +typedef struct +{ + cmsis_nn_tile stride; + cmsis_nn_tile padding; + cmsis_nn_activation activation; +} cmsis_nn_pool_params; + +/** CMSIS-NN object for Fully Connected layer parameters */ +typedef struct +{ + int32_t input_offset; /**< Zero value for the input tensor */ + int32_t filter_offset; /**< Zero value for the filter tensor. Not used */ + int32_t output_offset; /**< Zero value for the output tensor */ + cmsis_nn_activation activation; +} cmsis_nn_fc_params; + +/** CMSIS-NN object for SVDF layer parameters */ +typedef struct +{ + int32_t rank; + int32_t input_offset; /**< Zero value for the input tensor */ + int32_t output_offset; /**< Zero value for the output tensor */ + cmsis_nn_activation input_activation; + cmsis_nn_activation output_activation; +} cmsis_nn_svdf_params; + +/** CMSIS-NN object for Softmax s16 layer parameters */ +typedef struct +{ + const int16_t *exp_lut; + const int16_t *one_by_one_lut; +} cmsis_nn_softmax_lut_s16; + +#endif // _ARM_NN_TYPES_H diff --git a/Drivers/CMSIS/NN/Include/arm_nnfunctions.h b/Drivers/CMSIS/NN/Include/arm_nnfunctions.h new file mode 100644 index 0000000..deaade7 --- /dev/null +++ b/Drivers/CMSIS/NN/Include/arm_nnfunctions.h @@ -0,0 +1,2532 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nnfunctions.h + * Description: Public header file for CMSIS NN Library + * + * $Date: 19 April 2022 + * $Revision: V.9.0.0 + * + * Target Processor: Cortex-M CPUs + * -------------------------------------------------------------------- */ + +/** + \mainpage CMSIS NN Software Library + * + * Introduction + * ------------ + * + * This user manual describes the CMSIS NN software library, + * a collection of efficient neural network kernels developed to maximize the + * performance and minimize the memory footprint of neural networks on Cortex-M processor cores. + * + * The library is divided into a number of functions each covering a specific category: + * - Convolution Functions + * - Activation Functions + * - Fully-connected Layer Functions + * - SVDF Layer Functions + * - Pooling Functions + * - Softmax Functions + * - Basic math Functions + * + * The library has separate functions for operating on different weight and activation data + * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The descrition of the + * kernels are included in the function description. The implementation details are also + * described in this paper [1]. + * + * Supported Processors + * ------- + * CMSIS-NN targets Cortex-M processors with typically three different implementations for each function. Each + * targets a different group of processors. + * - Processors without SIMD capability (e.g, Cortex-M0) + * - Processors with DSP extention (e.g Cortex-M4) + * - Processors with MVE extension (e.g Cortex-M55) + * The right implementation is picked through feature flags and the user usually does not have to explicit set it. + * + * Function Classification + * -------- + * The functions can be classified into two segments + * - Legacy functions supporting ARM's internal symmetric quantization(8 bits). + * - Functions that support TensorFlow Lite framework with symmetric quantization(8 bits). + * + * The legacy functions can be identified with their suffix of _q7 or _q15 and are no new development is done there. + * The article in [2] describes in detail how to run a network using the legacy functions. + * + * The functions supporting TensorFlow Lite framework is identified by the _s8 suffix and can be invoked from TFL + * micro. The functions are bit exact to TensorFlow Lite. Refer to the TensorFlow's documentation in [3] on how to run + * a TensorFlow Lite model using optimized CMSIS-NN kernels. + * + * Block Diagram + * -------- + * \image html CMSIS-NN-OVERVIEW.PNG + * + * Examples + * -------- + * + * The library ships with a number of examples which demonstrate how to use the library functions. + * + * Pre-processor Macros + * ------------ + * + * Each library project have different pre-processor macros. + * + * - ARM_MATH_DSP: + * + * Define macro ARM_MATH_DSP, If the silicon supports DSP instructions(DSP extension). + * + * - ARM_MATH_MVEI: + * + * Define macro ARM_MATH_MVEI, If the silicon supports M-Profile Vector Extension. + + * - ARM_MATH_AUTOVECTORIZE + * Used in conjucture with ARM_MATH_MVEI to let the compiler auto vectorize for the functions that uses inline + * assembly. It does not affect functions that use C or intrinsics. + * - ARM_MATH_BIG_ENDIAN: + * + * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. This is supported only for the legacy + * functions i.e, functions targetted at TensorFlow Lite do not support big endianness. By default library builds for + * little endian targets. + * + * - ARM_NN_TRUNCATE: + * + * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation. + * + * + * Copyright Notice + * ------------ + * + * Copyright (C) 2010-2019 Arm Limited. All rights reserved. + * + * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601 + * + * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN + * + https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page + * [3] https://www.tensorflow.org/lite/microcontrollers/library + * + * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis + */ + +/** + * @defgroup groupNN Neural Network Functions + * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support + * TensorFlow Lite framework. + */ + +#ifndef _ARM_NNFUNCTIONS_H +#define _ARM_NNFUNCTIONS_H + +#include "arm_nn_math_types.h" +#include "arm_nn_types.h" + +#define USE_INTRINSIC + +//#define ARM_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Struct for specifying activation function types + * + */ +typedef enum +{ + ARM_SIGMOID = 0, + /**< Sigmoid activation function */ + ARM_TANH = 1, + /**< Tanh activation function */ +} arm_nn_activation_type; + +/** + * @defgroup NNConv Convolution Functions + * + * Collection of convolution, depthwise convolution functions and their variants. + * + * The convolution is implemented in 2 steps: im2col and GEMM + * + * im2col is a process of converting each patch of image data into + * a column. After im2col, the convolution is computed as matrix-matrix + * multiplication. + * + * To reduce the memory footprint, the im2col is performed partially. + * Each iteration, only a few column (i.e., patches) are generated and + * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions. + * + */ + +/** + * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in + cmsis-nn + * to perform the convolution. + * + * @param[in, out] ctx Function context that contains the additional buffer if required by the function. + arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * Range of conv_params->input_offset : [-127, 128] + * Range of conv_params->output_offset : [-128, 127] + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the + * spatial filter dimensions + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Bias data pointer. Data type: int32 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[out] output_data Output data pointer. Data type: int8 + * + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or, + * <code>ARM_MATH_SUCCESS</code> on successful completion. + * + */ +arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required buffer size for arm_convolve_wrapper_s8 + * + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * Range of conv_params->input_offset : [-127, 128] + * Range of conv_params->output_offset : [-128, 127] + * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN] + * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial + * filter dimensions + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * + * @return The function returns required buffer size(bytes) + * + */ +int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims); + +/** + * @brief s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in + cmsis-nn + * to perform the convolution. + * + * @param[in, out] ctx Function context that contains the additional buffer if required by the function. + arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * conv_params->input_offset : Not used + * conv_params->output_offset : Not used + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] input_data Input (activation) data pointer. Data type: int16 + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the + * spatial filter dimensions + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Bias data pointer. Data type: int64 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[out] output_data Output data pointer. Data type: int16 + * + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or, + * <code>ARM_MATH_SUCCESS</code> on successful completion. + * + */ +arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data); + +/** + * @brief Get the required buffer size for arm_convolve_wrapper_s16 + * + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * conv_params->input_offset : Not used + * conv_params->output_offset : Not used + * @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN] + * @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial + * filter dimensions + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * + * @return The function returns required buffer size(bytes) + * + */ +int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims); + +/** + * @brief Basic s8 convolution function + * @param[in, out] ctx Function context that contains the additional buffer if required by the function. + arm_convolve_s8_get_buffer_size will return the buffer_size if required + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * Range of conv_params->input_offset : [-127, 128] + * Range of conv_params->output_offset : [-128, 127] + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the + * spatial filter dimensions + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Optional bias data pointer. Data type: int32 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[out] output_data Output data pointer. Data type: int8 + + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * 1. Supported framework: TensorFlow Lite micro + * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. + * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. + * + */ +arm_status arm_convolve_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required buffer size for s8 convolution function + * + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK + * are the spatial filter dimensions + * @return The function returns required buffer size(bytes) + * + */ +int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); + +/** + * @brief Basic s16 convolution function + * @param[in, out] ctx Function context that contains the additional buffer if required by the function. + arm_convolve_s16_get_buffer_size will return the buffer_size if required + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * conv_params->input_offset : Not used + * conv_params->output_offset : Not used + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] input_data Input (activation) data pointer. Data type: int16 + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the + * spatial filter dimensions + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Optional bias data pointer. Data type: int64 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[out] output_data Output data pointer. Data type: int16 + + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * 1. Supported framework: TensorFlow Lite micro + * 2. q7/q15 is used as data type eventhough it is s8/s16 data. It is done so to be consistent with existing APIs. + * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. + * + */ +arm_status arm_convolve_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data); +/** + * @brief Optimized s16 convolution function + * @param[in, out] ctx Function context that contains the additional buffer if required by the function. + arm_convolve_fast_s16_get_buffer_size will return the buffer_size if required + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * conv_params->input_offset : Not used + * conv_params->output_offset : Not used + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] input_data Input (activation) data pointer. Data type: int16 + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the + * spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must not + exceed 512 + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Optional bias data pointer. Data type: int64 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[out] output_data Output data pointer. Data type: int16 + + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * 1. Supported framework: TensorFlow Lite micro + * 2. q7/q15 is used as data type eventhough it is s8/s16 data. It is done so to be consistent with existing APIs. + * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details. + * 4. Implementation supports kernel volumes (filter width * filter height * input channels) < 512. + * + */ + +arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data); + +/** + * @brief Get the required buffer size for s16 convolution function + * + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK + * are the spatial filter dimensions + * @return The function returns required buffer size(bytes) + * + */ +int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); + +/** + * @brief Get the required buffer size for fast s16 convolution function + * + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK + * are the spatial filter dimensions + * @return The function returns required buffer size(bytes) + * + */ +int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); + +/** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ +arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Basic Q7 convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + */ +arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ +arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Fast Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ +arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Fast Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + +arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or, + * <code>ARM_MATH_SUCCESS</code> on successful completion. + * + * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1 + * and dim_kernel_y=1). It can be used for + * second half of MobileNets after depthwise separable convolution. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ +arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Fast s8 version for 1x1 convolution (non-square shape) + * + * @param[in, out] ctx Function context that contains the additional buffer if required by the function. + arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * Range of conv_params->input_offset : [-127, 128] + * Range of conv_params->output_offset : [-128, 127] + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN] + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Optional bias data pointer. Data type: int32 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[out] output_data Output data pointer. Data type: int8 + * + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or, + * <code>ARM_MATH_SUCCESS</code> on successful completion. + * + * @details + * - Supported framework : TensorFlow Lite Micro + * - The following constrains on the arguments apply + * -# input_dims->c is a multiple of 4 + * -# conv_params->padding.w = conv_params->padding.h = 0 + * -# conv_params->stride.w = conv_params->stride.h = 1 + * + */ +arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required buffer size for arm_convolve_1x1_s8_fast + * + * @param[in] input_dims Input (activation) dimensions + * @return The function returns the required buffer size in bytes + * + */ +int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims); + +/** + * @brief 1xn convolution + * + * @param[in, out] ctx Function context that contains the additional buffer if required by the function. + arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required + * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...). + * Range of conv_params->input_offset : [-127, 128] + * Range of conv_params->output_offset : [-128, 127] + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal + * spatial filter dimension + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Optional bias data pointer. Data type: int32 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[out] output_data Output data pointer. Data type: int8 + * + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or, + * <code>ARM_MATH_SUCCESS</code> on successful completion. + * + * @details + * - Supported framework : TensorFlow Lite Micro + * - The following constrains on the arguments apply + * -# input_dims->n equals 1 + * -# ouput_dims->w is a multiple of 4 + * -# Explicit constraints(since it is for 1xN convolution) + * -## input_dims->h equals 1 + * -## output_dims->h equals 1 + * -## filter_dims->h equals 1 + *@todo Remove constraint on output_dims->w to make the function generic. + * + */ +arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required additional buffer size for 1xn convolution + * + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the + * horizontal spatial filter dimension + * @return The function returns required buffer size(bytes) + * + */ +int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); + +/** + * @brief Q7 version of convolution for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. This applies on the first layer of CNNs which has input + * image with RGB format. + */ + +arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + * dim_im_out is a multiple of 2 + */ + +arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Fast Q15 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multipe of 2 + * + */ + +arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + +arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ +arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB); + +/** + * @brief Wrapper function to pick the right optimized s8 depthwise convolution function + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if required. + * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...) + * dw_conv_params->dilation is not used. + * Range of dw_conv_params->input_offset : [-127, 128] + * Range of dw_conv_params->output_offset : [-128, 127] + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each + * output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] + * Batch argument N is not used and assumed to be 1. + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Bias data pointer. Data type: int32 + * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT] + * @param[in, out] output_data Output data pointer. Data type: int8 + * @return The function returns + * <code>ARM_MATH_SUCCESS</code> - Successful completion. + * + * @details + * - Supported framework: TensorFlow Lite + * - Picks one of the the following functions + * -# arm_depthwise_conv_s8() + * -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only + * -# arm_depthwise_conv_s8_opt() + * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. + * - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the + * boundary. + */ +arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8() + * + * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...) + * dw_conv_params->dilation is not used. + * Range of dw_conv_params->input_offset : [-127, 128] + * Range of dw_conv_params->input_offset : [-128, 127] + * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] + * Batch argument N is not used and assumed to be 1. + * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] + * @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT] + * @return Size of additional memory required for optimizations in bytes. + * + */ +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims); + +/** + * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * exists if additional memory is. + * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...) + * dw_conv_params->dilation is not used. + * Range of dw_conv_params->input_offset : [-127, 128] + * Range of dw_conv_params->input_offset : [-128, 127] + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each + * output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * Batch argument N is not used. + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Bias data pointer. Data type: int32 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[in, out] output_data Output data pointer. Data type: int8 + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * - Supported framework: TensorFlow Lite + * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. + */ +arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * exists if additional memory is. + * @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...) + * conv_params->input_offset : Not used + * conv_params->output_offset : Not used + * @param[in] quant_params Per-channel quantization info. + * It contains the multiplier and shift values to be applied to each + * output channel + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * Batch argument N is not used. + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * @param[in] bias_data Bias data pointer. Data type: int64 + * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT] + * @param[in, out] output_data Output data pointer. Data type: int16 + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * - Supported framework: TensorFlow Lite + * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs. + */ +arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data); + +/** + * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on + * the input arguments(documented below). Refer arm_depthwise_conv_s8() for function + * argument details. + * + * @return The function returns one of the following + * <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported dimension of tensors + * <code>ARM_MATH_ARGUMENT_ERROR</code> - Unsupported pad size along the x axis + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * + * @details + * - Supported framework : TensorFlow Lite Micro + * - The following constrains on the arguments apply + * -# Number of input channel equals number of output channels + * -# Filter height and width equals 3 + * -# Padding along x is either 0 or 1. + * + */ +arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. + * Refer arm_depthwise_conv_s8() for function argument details. + * + * @return The function returns one of the following + * <code>ARM_MATH_SIZE_MISMATCH</code> - input channel != output channel or + * ch_mult != 1 + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * + * @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out + * for the following if MVE optimizations(Arm Helium Technology) are used. + * - Output shift + * - Output multiplier + * - Output bias + * - kernel + * @details + * - Supported framework: TensorFlow Lite + * - The following constrains on the arguments apply + * -# Number of input channel equals number of output channels or ch_mult equals 1 + * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. + * - Reccomended when number of channels is 4 or greater. + * + */ +arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required buffer size for optimized s8 depthwise convolution + * function with constraint that in_channel equals out_channel. + * @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN] + * Batch argument N is not used. + * @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT] + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims); + +/** + * @defgroup FC Fully-connected Layer Functions + * + * Collection of fully-connected and matrix multiplication functions. + * + * Fully-connected layer is basically a matrix-vector multiplication + * with bias. The matrix is the weights and the input/output vectors + * are the activation values. Supported {weight, activation} precisions + * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}. + * + * Here we have two types of kernel functions. The basic function + * implements the function using regular GEMV approach. The opt functions + * operates with weights in interleaved formats. + * + */ + +/** + *@brief Q7 basic fully-connected layer function + *@param[in] pV pointer to input vector + *@param[in] pM pointer to matrix weights + *@param[in] dim_vec length of the vector + *@param[in] num_of_rows number of rows in weight matrix + *@param[in] bias_shift amount of left-shift for bias + *@param[in] out_shift amount of right-shift for output + *@param[in] bias pointer to bias + *@param[in,out] pOut pointer to output vector + *@param[in,out] vec_buffer pointer to buffer space for input + *@return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ + +arm_status arm_fully_connected_q7(const q7_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Basic s8 Fully Connected function. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * @param[in] fc_params Fully Connected layer parameters. + * Range of fc_params->input_offset : [-127, 128] + * fc_params->filter_offset : 0 + * Range of fc_params->output_offset : [-128, 127] + * @param[in] quant_params Per-tensor quantization info. + * It contains the multiplier and shift values to be applied to the output tensor. + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * Input dimension is taken as Nx(H * W * C_IN) + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C] + * N : accumulation depth and equals (H * W * C_IN) from input_dims + * C : output depth and equals C_OUT in output_dims + * H & W : Not used + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * N, H, W : Not used + * @param[in] bias_data Bias data pointer. Data type: int32 + * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT] + * N : Batches + * C_OUT : Output depth + * H & W : Not used. + * @param[in, out] output_data Output data pointer. Data type: int8 + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * - Supported framework: TensorFlow Lite + * - q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. + */ +arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx, + const cmsis_nn_fc_params *fc_params, + const cmsis_nn_per_tensor_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required buffer size for S8 basic fully-connected and + * matrix multiplication layer function for TF Lite + * @param[in] filter_dims dimension of filter + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); + +/** + * @brief Basic s16 Fully Connected function. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * @param[in] fc_params Fully Connected layer parameters. + * fc_params->input_offset : 0 + * fc_params->filter_offset : 0 + * fc_params->output_offset : 0 + * @param[in] quant_params Per-tensor quantization info. + * It contains the multiplier and shift values to be applied to the output tensor. + * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN] + * Input dimension is taken as Nx(H * W * C_IN) + * @param[in] input_data Input (activation) data pointer. Data type: int16 + * @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C] + * N : accumulation depth and equals (H * W * C_IN) from input_dims + * C : output depth and equals C_OUT in output_dims + * H & W : Not used + * @param[in] filter_data Filter data pointer. Data type: int8 + * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT] + * N, H, W : Not used + * @param[in] bias_data Bias data pointer. Data type: int64 + * @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT] + * N : Batches + * C_OUT : Output depth + * H & W : Not used. + * @param[in, out] output_data Output data pointer. Data type: int16 + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * - Supported framework: TensorFlow Lite + * - q15 is used as data type eventhough it is s16 data. It is done so to be consistent with existing APIs. + */ +arm_status arm_fully_connected_s16(const cmsis_nn_context *ctx, + const cmsis_nn_fc_params *fc_params, + const cmsis_nn_per_tensor_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data); + +/** + * @brief Get the required buffer size for S16 basic fully-connected and + * matrix multiplication layer function for TF Lite + * @param[in] filter_dims dimension of filter + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims); + +/** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ + +arm_status arm_fully_connected_q7_opt(const q7_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Q15 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ + +arm_status arm_fully_connected_q15(const q15_t *pV, + const q15_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t *bias, + q15_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ + +arm_status arm_fully_connected_q15_opt(const q15_t *pV, + const q15_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t *bias, + q15_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ + +arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q15_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ + +arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q15_t *pOut, + q15_t *vec_buffer); + +/** + * @brief Matrix-Multiplication Kernels for Convolution + * + * These functions are used within convolution layer functions for + * matrix multiplication. + * + * The implementation is similar to CMSIS-DSP arm_mat_mult functions + * with one Q7 and one Q15 operands. The Q15 operand is the im2col + * output which is always with 2 columns. + * + */ + +/** + * @brief Matrix-multiplication function for convolution + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + */ + +q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut); + +#ifdef __cplusplus +} +#endif + +/* + * Other functions + * These layers are typically not timing critical + * Basic implementation is supported here + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @defgroup BasicMath Basic math functions + * + * Elementwise add and multiplication functions. + * + */ + +/** + * @brief s8 elementwise add of two vectors + * @param[in] input_1_vect pointer to input vector 1 + * @param[in] input_2_vect pointer to input vector 2 + * @param[in] input_1_offset offset for input 1. Range: -127 to 128 + * @param[in] input_1_mult multiplier for input 1 + * @param[in] input_1_shift shift for input 1 + * @param[in] input_2_offset offset for input 2. Range: -127 to 128 + * @param[in] input_2_mult multiplier for input 2 + * @param[in] input_2_shift shift for input 2 + * @param[in] left_shift input left shift + * @param[in,out] output pointer to output vector + * @param[in] out_offset output offset. Range: -128 to 127 + * @param[in] out_mult output multiplier + * @param[in] out_shift output shift + * @param[in] out_activation_min minimum value to clamp output to. Min: -128 + * @param[in] out_activation_max maximum value to clamp output to. Max: 127 + * @param[in] block_size number of samples + * @return The function returns ARM_MATH_SUCCESS + */ +arm_status arm_elementwise_add_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_1_mult, + const int32_t input_1_shift, + const int32_t input_2_offset, + const int32_t input_2_mult, + const int32_t input_2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size); + +/** + * @brief s16 elementwise add of two vectors + * @param[in] input_1_vect pointer to input vector 1 + * @param[in] input_2_vect pointer to input vector 2 + * @param[in] input_1_offset offset for input 1. Not used. + * @param[in] input_1_mult multiplier for input 1 + * @param[in] input_1_shift shift for input 1 + * @param[in] input_2_offset offset for input 2. Not used. + * @param[in] input_2_mult multiplier for input 2 + * @param[in] input_2_shift shift for input 2 + * @param[in] left_shift input left shift + * @param[in,out] output pointer to output vector + * @param[in] out_offset output offset. Not used. + * @param[in] out_mult output multiplier + * @param[in] out_shift output shift + * @param[in] out_activation_min minimum value to clamp output to. Min: -32768 + * @param[in] out_activation_max maximum value to clamp output to. Max: 32767 + * @param[in] block_size number of samples + * @return The function returns ARM_MATH_SUCCESS + */ +arm_status arm_elementwise_add_s16(const int16_t *input_1_vect, + const int16_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_1_mult, + const int32_t input_1_shift, + const int32_t input_2_offset, + const int32_t input_2_mult, + const int32_t input_2_shift, + const int32_t left_shift, + int16_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size); + +/** + * @brief s8 elementwise multiplication + * @param[in] input_1_vect pointer to input vector 1 + * @param[in] input_2_vect pointer to input vector 2 + * @param[in] input_1_offset offset for input 1. Range: -127 to 128 + * @param[in] input_2_offset offset for input 2. Range: -127 to 128 + * @param[in,out] output pointer to output vector + * @param[in] out_offset output offset. Range: -128 to 127 + * @param[in] out_mult output multiplier + * @param[in] out_shift output shift + * @param[in] out_activation_min minimum value to clamp output to. Min: -128 + * @param[in] out_activation_max maximum value to clamp output to. Max: 127 + * @param[in] block_size number of samples + * @return The function returns ARM_MATH_SUCCESS + * + * @details Supported framework: TensorFlow Lite micro + */ +arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size); + +/** + * @brief s16 elementwise multiplication + * @param[in] input_1_vect pointer to input vector 1 + * @param[in] input_2_vect pointer to input vector 2 + * @param[in] input_1_offset offset for input 1. Not used. + * @param[in] input_2_offset offset for input 2. Not used. + * @param[in,out] output pointer to output vector + * @param[in] out_offset output offset. Not used. + * @param[in] out_mult output multiplier + * @param[in] out_shift output shift + * @param[in] out_activation_min minimum value to clamp output to. Min: -32768 + * @param[in] out_activation_max maximum value to clamp output to. Max: 32767 + * @param[in] block_size number of samples + * @return The function returns ARM_MATH_SUCCESS + * + * @details Supported framework: TensorFlow Lite micro + */ +arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect, + const int16_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_2_offset, + int16_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size); + +/** + * @defgroup Acti Activation Functions + * + * Perform activation layers, including ReLU (Rectified Linear Unit), + * sigmoid and tanh + * + */ + +/** + * @brief Q7 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + +void arm_relu_q7(q7_t *data, uint16_t size); + +/** + * @brief s8 ReLU6 function + * @param[in,out] data pointer to input + * @param[in] size number of elements + */ + +void arm_relu6_s8(q7_t *data, uint16_t size); + +/** + * @brief Q15 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + +void arm_relu_q15(q15_t *data, uint16_t size); + +/** + * @brief Q7 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + */ + +void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type); + +/** + * @brief Q15 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + * + * @details + * + * This is the direct table look-up approach. + * + * Assume here the integer part of the fixed-point is <= 3. + * More than 3 just not making much sense, makes no difference with + * saturation followed by any of these activation functions. + */ + +void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type); + +/** + * @defgroup Pooling Pooling Functions + * + * Perform pooling functions, including max pooling and average pooling + * + */ + +/** + * @brief Q7 max pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + */ + +void arm_maxpool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out); + +/** + * @brief Q7 average pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + */ + +void arm_avepool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out); + +/** + * @brief s8 average pooling function. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * @param[in] pool_params Pooling parameters + * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] + * Argument 'N' is not used. + * @param[in] input_data Input (activation) data pointer. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] + * Argument N and C are not used. + * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] + * Argument N is not used. + * C_OUT equals C_IN. + * @param[in, out] output_data Output data pointer. Data type: int8 + * @return The function returns + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * + * @details + * - Supported Framework: TensorFlow Lite + * + */ +arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief Get the required buffer size for S8 average pooling function + * @param[in] dim_dst_width output tensor dimension + * @param[in] ch_src number of input tensor channels + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src); + +/** + * @brief s16 average pooling function. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * @param[in] pool_params Pooling parameters + * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] + * Argument 'N' is not used. + * @param[in] input_data Input (activation) data pointer. Data type: int16 + * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] + * Argument N and C are not used. + * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] + * Argument N is not used. + * C_OUT equals C_IN. + * @param[in, out] output_data Output data pointer. Data type: int16 + * @return The function returns + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * + * @details + * - Supported Framework: TensorFlow Lite + * + */ +arm_status arm_avgpool_s16(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const int16_t *input_data, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + int16_t *output_data); + +/** + * @brief Get the required buffer size for S16 average pooling function + * @param[in] dim_dst_width output tensor dimension + * @param[in] ch_src number of input tensor channels + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src); + +/** + * @brief s8 max pooling function. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * @param[in] pool_params Pooling parameters + * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] + * Argument 'N' is not used. + * @param[in] input_data Input (activation) data pointer. The input tensor must not + * overlap with the output tensor. Data type: int8 + * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] + * Argument N and C are not used. + * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] + * Argument N is not used. + * C_OUT equals C_IN. + * @param[in, out] output_data Output data pointer. Data type: int8 + * @return The function returns + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * + * @details + * - Supported Framework: TensorFlow Lite + * + */ +arm_status arm_max_pool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief s16 max pooling function. + * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function {API}_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * @param[in] pool_params Pooling parameters + * @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN] + * Argument 'N' is not used. + * @param[in] src Input (activation) data pointer. The input tensor must not + * overlap with the output tensor. Data type: int16 + * @param[in] filter_dims Filter tensor dimensions. Format: [H, W] + * Argument N and C are not used. + * @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT] + * Argument N is not used. + * C_OUT equals C_IN. + * @param[in, out] dst Output data pointer. Data type: int16 + * @return The function returns + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * + * @details + * - Supported Framework: TensorFlow Lite + * + */ +arm_status arm_max_pool_s16(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const int16_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + int16_t *dst); + +/** + * @defgroup Softmax Softmax Functions + * + * EXP(2) based softmax functions. + * + */ + +/** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * + * @note This function is an optimized version which is not bit-accurate with + * TensorFlow Lite's kernel + * + */ + +void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); + +/** + * @brief Q7 softmax function with batch parameter + * @param[in] vec_in pointer to input vector + * @param[in] nb_batches number of batches + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * @return none. + * + * @note This function is an optimized version which is not bit-accurate with + * TensorFlow Lite's kernel + * + */ + +void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out); +/** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimension + * @param[out] p_out pointer to output vector + * @return none. + * + * @note This function is an optimized version which is not bit-accurate with + * TensorFlow Lite's kernel + * + */ + +void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); + +/** + * @brief S8 softmax function + * @param[in] input Pointer to the input tensor + * @param[in] num_rows Number of rows in the input tensor + * @param[in] row_size Number of elements in each input row + * @param[in] mult Input quantization multiplier + * @param[in] shift Input quantization shift within the range [0, 31] + * @param[in] diff_min Minimum difference with max in row. Used to check if + * the quantized exponential operation can be performed + * @param[out] output Pointer to the output tensor + * + * @note Supported framework: TensorFlow Lite micro (bit-accurate) + * + */ +void arm_softmax_s8(const int8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output); + +/** + * @brief S8 to s16 softmax function + * @param[in] input Pointer to the input tensor + * @param[in] num_rows Number of rows in the input tensor + * @param[in] row_size Number of elements in each input row + * @param[in] mult Input quantization multiplier + * @param[in] shift Input quantization shift within the range [0, 31] + * @param[in] diff_min Minimum difference with max in row. Used to check if + * the quantized exponential operation can be performed + * @param[out] output Pointer to the output tensor + * + * @note Supported framework: TensorFlow Lite micro (bit-accurate) + * + */ +void arm_softmax_s8_s16(const int8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int16_t *output); + +/** + * @brief S16 softmax function + * @param[in] input Pointer to the input tensor + * @param[in] num_rows Number of rows in the input tensor + * @param[in] row_size Number of elements in each input row + * @param[in] mult Input quantization multiplier + * @param[in] shift Input quantization shift within the range [0, 31] + * @param[in] softmax_params Softmax s16 layer parameters with two pointers to LUTs speficied below. + * For indexing the high 9 bits are used and 7 remaining for interpolation. + * That means 512 entries for the 9-bit indexing and 1 extra for interpolation, i.e. 513 + * values for each LUT. + * - Lookup table for exp(x), where x uniform distributed between [-10.0 , 0.0] + * - Lookup table for 1 / (1 + x), where x uniform distributed between [0.0 , 1.0] + * @param[out] output Pointer to the output tensor + * @return The function returns + * <code>ARM_MATH_ARGUMENT_ERROR</code> if LUTs are NULL + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * + * @note Supported framework: TensorFlow Lite micro (bit-accurate) + * + */ +arm_status arm_softmax_s16(const int16_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const cmsis_nn_softmax_lut_s16 *softmax_params, + int16_t *output); + +/** + * @brief U8 softmax function + * @param[in] input Pointer to the input tensor + * @param[in] num_rows Number of rows in the input tensor + * @param[in] row_size Number of elements in each input row + * @param[in] mult Input quantization multiplier + * @param[in] shift Input quantization shift within the range [0, 31] + * @param[in] diff_min Minimum difference with max in row. Used to check if + * the quantized exponential operation can be performed + * @param[out] output Pointer to the output tensor + * + * @note Supported framework: TensorFlow Lite micro (bit-accurate) + * + */ + +void arm_softmax_u8(const uint8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + uint8_t *output); + +/** + * @brief uint8 depthwise convolution function with asymmetric quantization + * Unless specified otherwise, arguments are mandatory. + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_ch Channels in input tensor + * @param[in] kernel Pointer to kernel weights + * @param[in] kernel_x Width of kernel + * @param[in] kernel_y Height of kernel + * @param[in] ch_mult Number of channel multiplier + * @param[in] pad_x Padding sizes x + * @param[in] pad_y Padding sizes y + * @param[in] stride_x stride along the width + * @param[in] stride_y stride along the height + * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement. + * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement. + * @param[in] bias Pointer to optional bias values. If no bias is + * availble, NULL is expected + * @param[in] input_offset Input tensor zero offset + * @param[in] filter_offset Kernel tensor zero offset + * @param[in] output_offset Output tensor zero offset + * @param[in,out] output Pointer to output tensor + * @param[in] output_x Width of output tensor + * @param[in] output_y Height of output tensor + * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255} + * @param[in] output_activation_max Minimum value to clamp the output to. Range : {0, 255} + * @param[in] out_shift Amount of right-shift for output + * @param[in] out_mult Output multiplier for requantization + * @return The function returns the following + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * + */ +arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const uint8_t *kernel, + const uint16_t kernel_x, + const uint16_t kernel_y, + const int16_t ch_mult, + const int16_t pad_x, + const int16_t pad_y, + const int16_t stride_x, + const int16_t stride_y, + const int16_t dilation_x, + const int16_t dilation_y, + const int32_t *bias, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_offset, + uint8_t *output, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t out_shift, + const int32_t out_mult); + +/** + * @defgroup Reshape Reshape Functions + * + */ + +/** + * @brief Reshape a s8 vector into another with different shape + * @param[in] input points to the s8 input vector + * @param[out] output points to the s8 output vector + * @param[in] total_size total size of the input and output vectors in bytes + * + * @note The output is expected to be in a memory area that does not overlap with the input's + * + */ +void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size); + +/** + * @defgroup Concatenation Concatenation Functions + * + */ + +/** + * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis + * This function should be called for each input tensor to concatenate. The argument offset_x + * will be used to store the input tensor in the correct position in the output tensor + * + * i.e. offset_x = 0 + * for(i = 0 i < num_input_tensors; ++i) + * { + * arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x) + * offset_x += input_x[i] + * } + * + * This function assumes that the output tensor has: + * -# The same height of the input tensor + * -# The same number of channels of the input tensor + * -# The same batch size of the input tensor + * + * Unless specified otherwise, arguments are mandatory. + * + * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it + * does not involve any arithmetic operation + * + * @param[in] input Pointer to input tensor. Input tensor must not overlap with the output tensor. + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_z Channels in input tensor + * @param[in] input_w Batch size in input tensor + * @param[out] output Pointer to output tensor. Expected to be at least + * (input_x * input_y * input_z * input_w) + offset_x + * bytes. + * @param[in] output_x Width of output tensor + * @param[in] offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor + * It is user responsibility to provide the correct value + * + * <b> Input constraints</b> + * offset_x is less than output_x + * + */ +void arm_concatenation_s8_x(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_x, + const uint32_t offset_x); + +/** + * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis + * This function should be called for each input tensor to concatenate. The argument offset_y + * will be used to store the input tensor in the correct position in the output tensor + * + * i.e. offset_y = 0 + * for(i = 0 i < num_input_tensors; ++i) + * { + * arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y) + * offset_y += input_y[i] + * } + * + * This function assumes that the output tensor has: + * -# The same width of the input tensor + * -# The same number of channels of the input tensor + * -# The same batch size of the input tensor + * + * Unless specified otherwise, arguments are mandatory. + * + * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it + * does not involve any arithmetic operation + * + * @param[in] input Pointer to input tensor. Input tensor must not overlap with the output tensor. + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_z Channels in input tensor + * @param[in] input_w Batch size in input tensor + * @param[out] output Pointer to output tensor. Expected to be at least + * (input_z * input_w * input_x * input_y) + offset_y + * bytes. + * @param[in] output_y Height of output tensor + * @param[in] offset_y The offset on the Y axis to start concatenating the input tensor + * It is user responsibility to provide the correct value + * + * <b> Input constraints</b> + * offset_y is less than output_y + * + */ +void arm_concatenation_s8_y(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_y, + const uint32_t offset_y); + +/** + * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis + * This function should be called for each input tensor to concatenate. The argument offset_z + * will be used to store the input tensor in the correct position in the output tensor + * + * i.e. offset_z = 0 + * for(i = 0 i < num_input_tensors; ++i) + * { + * arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z) + * offset_z += input_z[i] + * } + * + * This function assumes that the output tensor has: + * -# The same width of the input tensor + * -# The same height of the input tensor + * -# The same batch size of the input tensor + * + * Unless specified otherwise, arguments are mandatory. + * + * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it + * does not involve any arithmetic operation + * + * @param[in] input Pointer to input tensor. Input tensor must not overlap with output tensor. + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_z Channels in input tensor + * @param[in] input_w Batch size in input tensor + * @param[out] output Pointer to output tensor. Expected to be at least + * (input_x * input_y * input_z * input_w) + offset_z + * bytes. + * @param[in] output_z Channels in output tensor + * @param[in] offset_z The offset on the Z axis to start concatenating the input tensor + * It is user responsibility to provide the correct value + * + * <b> Input constraints</b> + * offset_z is less than output_z + * + */ +void arm_concatenation_s8_z(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_z, + const uint32_t offset_z); + +/** + * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size) + * This function should be called for each input tensor to concatenate. The argument offset_w + * will be used to store the input tensor in the correct position in the output tensor + * + * i.e. offset_w = 0 + * for(i = 0 i < num_input_tensors; ++i) + * { + * arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w) + * offset_w += input_w[i] + * } + * + * This function assumes that the output tensor has: + * -# The same width of the input tensor + * -# The same height of the input tensor + * -# The same number o channels of the input tensor + * + * Unless specified otherwise, arguments are mandatory. + * + * @note This function, data layout independent, can be used to concatenate either int8 or uint8 tensors because it + * does not involve any arithmetic operation + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_z Channels in input tensor + * @param[in] input_w Batch size in input tensor + * @param[out] output Pointer to output tensor. Expected to be at least + * input_x * input_y * input_z * input_w + * bytes. + * @param[in] offset_w The offset on the W axis to start concatenating the input tensor + * It is user responsibility to provide the correct value + * + */ +void arm_concatenation_s8_w(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint32_t offset_w); +/** + * @defgroup SVDF SVDF Layer Functions + * + */ + +/** + * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights + * + * @param[in] input_ctx Temporary scratch buffer + * @param[in] output_ctx Temporary output scratch buffer + * @param[in] svdf_params SVDF Parameters + * Range of svdf_params->input_offset : [-128, 127] + * Range of svdf_params->output_offset : [-128, 127] + * @param[in] input_quant_params Input quantization parameters + * @param[in] output_quant_params Output quantization parameters + * @param[in] input_dims Input tensor dimensions + * @param[in] input_data Pointer to input tensor + * @param[in] state_dims State tensor dimensions + * @param[in] state_data Pointer to state tensor + * @param[in] weights_feature_dims Weights (feature) tensor dimensions + * @param[in] weights_feature_data Pointer to the weights (feature) tensor + * @param[in] weights_time_dims Weights (time) tensor dimensions + * @param[in] weights_time_data Pointer to the weights (time) tensor + * @param[in] bias_dims Bias tensor dimensions + * @param[in] bias_data Pointer to bias tensor + * @param[in] output_dims Output tensor dimensions + * @param[out] output_data Pointer to the output tensor + * + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * 1. Supported framework: TensorFlow Lite micro + * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. + * + */ +arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx, + const cmsis_nn_context *output_ctx, + const cmsis_nn_svdf_params *svdf_params, + const cmsis_nn_per_tensor_quant_params *input_quant_params, + const cmsis_nn_per_tensor_quant_params *output_quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *state_dims, + q7_t *state_data, + const cmsis_nn_dims *weights_feature_dims, + const q7_t *weights_feature_data, + const cmsis_nn_dims *weights_time_dims, + const q7_t *weights_time_data, + const cmsis_nn_dims *bias_dims, + const q31_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +/** + * @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights + * + * @param[in] input_ctx Temporary scratch buffer + * @param[in] output_ctx Temporary output scratch buffer + * @param[in] svdf_params SVDF Parameters + * Range of svdf_params->input_offset : [-128, 127] + * Range of svdf_params->output_offset : [-128, 127] + * @param[in] input_quant_params Input quantization parameters + * @param[in] output_quant_params Output quantization parameters + * @param[in] input_dims Input tensor dimensions + * @param[in] input_data Pointer to input tensor + * @param[in] state_dims State tensor dimensions + * @param[in] state_data Pointer to state tensor + * @param[in] weights_feature_dims Weights (feature) tensor dimensions + * @param[in] weights_feature_data Pointer to the weights (feature) tensor + * @param[in] weights_time_dims Weights (time) tensor dimensions + * @param[in] weights_time_data Pointer to the weights (time) tensor + * @param[in] bias_dims Bias tensor dimensions + * @param[in] bias_data Pointer to bias tensor + * @param[in] output_dims Output tensor dimensions + * @param[out] output_data Pointer to the output tensor + * + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * 1. Supported framework: TensorFlow Lite micro + * 2. q7 is used as data type eventhough it is s8 data. It is done so to be consistent with existing APIs. + * + */ +arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, + const cmsis_nn_context *output_ctx, + const cmsis_nn_svdf_params *svdf_params, + const cmsis_nn_per_tensor_quant_params *input_quant_params, + const cmsis_nn_per_tensor_quant_params *output_quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *state_dims, + q15_t *state_data, + const cmsis_nn_dims *weights_feature_dims, + const q7_t *weights_feature_data, + const cmsis_nn_dims *weights_time_dims, + const q15_t *weights_time_data, + const cmsis_nn_dims *bias_dims, + const q31_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/Drivers/CMSIS/NN/Include/arm_nnsupportfunctions.h b/Drivers/CMSIS/NN/Include/arm_nnsupportfunctions.h new file mode 100644 index 0000000..4b50564 --- /dev/null +++ b/Drivers/CMSIS/NN/Include/arm_nnsupportfunctions.h @@ -0,0 +1,1186 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nnsupportfunctions.h + * Description: Public header file of support functions for CMSIS NN Library + * + * $Date: 19. April 2022 + * $Revision: V.7.0.1 + * + * Target Processor: Cortex-M CPUs + * -------------------------------------------------------------------- */ + +#ifndef _ARM_NNSUPPORTFUNCTIONS_H_ +#define _ARM_NNSUPPORTFUNCTIONS_H_ + +#include "arm_nn_math_types.h" +#include "arm_nn_types.h" + +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0) +#define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift) +#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0 +#define MASK_IF_NON_ZERO(x) (x) != 0 ? ~0 : 0 +#define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b)) + +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MIN(A, B) ((A) < (B) ? (A) : (B)) +#define CLAMP(x, h, l) MAX(MIN((x), (h)), (l)) +#define REDUCE_MULTIPLIER(_mult) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF) + +/** + * @brief definition to pack four 8 bit values. + */ +#define PACK_Q7x4_32x1(v0, v1, v2, v3) \ + ((((int32_t)(v0) << 0) & (int32_t)0x000000FF) | (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \ + (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | (((int32_t)(v3) << 24) & (int32_t)0xFF000000)) + +/** + * @brief Union for SIMD access of q31/q15/q7 types + */ +union arm_nnword +{ + q31_t word; + /**< q31 type */ + q15_t half_words[2]; + /**< q15 type */ + q7_t bytes[4]; + /**< q7 type */ +}; + +/** + * @brief Union for data type long long + */ +struct arm_nn_double +{ + uint32_t low; + int32_t high; +}; + +union arm_nn_long_long +{ + int64_t long_long; + struct arm_nn_double word; +}; + +/** + * @defgroup nndata_convert Neural Network Data Conversion Functions + * + * Perform data type conversion in-between neural network operations + * + */ + +/** + * @brief Converts the elements of the q7 vector to q15 vector without left-shift + * @param[in] *pSrc points to the q7 input vector + * @param[out] *pDst points to the q15 output vector + * @param[in] blockSize length of the input vector + * + */ +void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize); + +/** + * @brief Non-saturating addition of elements of a q7 vector + * @param[in] *input Pointer to the q7 input vector + * @param[out] *output Pointer to the q31 output variable. + * @param[in] block_size length of the input vector + * \par Description: + * + * 2^24 samples can be added without saturating the result. + * + * The equation used for the conversion process is: + * + * <pre> + * sum = input[0] + input[1] + .. + input[block_size -1] + * </pre> + * + * */ +void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size); + +/** + * @brief Converts the elements of the q7 vector to reordered q15 vector without left-shift + * @param[in] *pSrc points to the q7 input vector + * @param[out] *pDst points to the q15 output vector + * @param[in] blockSize length of the input vector + * @return none. + * + */ +void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize); + +/** + * @brief Converts the elements from a q7 vector to a q15 vector with an added offset + * @param[in] src pointer to the q7 input vector + * @param[out] dst pointer to the q15 output vector + * @param[in] block_size length of the input vector + * @param[in] offset q7 offset to be added to each input vector element. + * + * \par Description: + * + * The equation used for the conversion process is: + * + * <pre> + * dst[n] = (q15_t) src[n] + offset; 0 <= n < block_size. + * </pre> + * + */ +void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset); + +/** + * @brief Converts the elements of the q7 vector to reordered q15 vector with an added offset + * @param[in] src pointer to the q7 input vector + * @param[out] dst pointer to the q15 output vector + * @param[in] block_size length of the input vector + * @param[in] offset offset to be added to each input vector element. + * @return none. + * + * @details This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of + * the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its + * original order. + * + */ +void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset); + +/** + * @brief Converts the elements from a q7 vector and accumulate to a q15 vector + * @param[in] *src points to the q7 input vector + * @param[out] *dst points to the q15 output vector + * @param[in] block_size length of the input vector + * + * \par Description: + * + * The equation used for the conversion process is: + * + * <pre> + * dst[n] += (q15_t) src[n] ; 0 <= n < block_size. + * </pre> + * + */ +void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size); + +/** + * @brief Depthwise conv on an im2col buffer where the input channel equals output channel. + * @param[in] row pointer to row + * @param[in] col pointer to im2col buffer, always consists of 2 columns. + * @param[in] num_ch number of channels + * @param[in] out_shift pointer to per output channel requantization shift parameter. + * @param[in] out_mult pointer to per output channel requantization multiplier parameter. + * @param[in] out_offset output tensor offset. + * @param[in] activation_min minimum value to clamp the output to. Range : int8 + * @param[in] activation_max maximum value to clamp the output to. Range : int8 + * @param[in] kernel_size number of elements in one column. + * @param[in] output_bias per output channel bias. Range : int32 + * @param[out] out pointer to output + * @return The function returns one of the two + * 1. The incremented output pointer for a successful operation or + * 2. NULL if implementation is not available. + * + * @details Supported framework: TensorFlow Lite micro. + */ +q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, + const q15_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + q7_t *out); + +/** + * @brief General Matrix-multiplication function with per-channel requantization. + * @param[in] input_row pointer to row operand + * @param[in] input_col pointer to col operand + * @param[in] output_ch number of rows of input_row + * @param[in] col_batches number of column batches. Range: 1 to 4 + * @param[in] output_shift pointer to per output channel requantization shift parameter. + * @param[in] output_mult pointer to per output channel requantization multiplier parameter. + * @param[in] out_offset output tensor offset. + * @param[in] col_offset input tensor(col) offset. + * @param[in] row_offset kernel offset(row). Not used. + * @param[in] out_activation_min minimum value to clamp the output to. Range : int8 + * @param[in] out_activation_max maximum value to clamp the output to. Range : int8 + * @param[in] row_len number of elements in each row + * @param[in] bias per output channel bias. Range : int32 + * @param[in,out] out pointer to output + * @return The function returns one of the two + * 1. The incremented output pointer for a successful operation or + * 2. NULL if implementation is not available. + * + * @details Supported framework: TensorFlow Lite + */ +q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, + const q7_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t out_activation_min, + const int16_t out_activation_max, + const uint16_t row_len, + const int32_t *const bias, + q7_t *out); +/** + * @brief Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution. + * @param[in] input_a pointer to operand A + * @param[in] input_b pointer to operand B, always consists of 2 vectors. + * @param[in] output_ch number of rows of A + * @param[in] out_shift pointer to per output channel requantization shift parameter. + * @param[in] out_mult pointer to per output channel requantization multiplier parameter. + * @param[in] activation_min minimum value to clamp the output to. Range : int16 + * @param[in] activation_max maximum value to clamp the output to. Range : int16 + * @param[in] num_col_a number of columns of A + * @param[in] output_bias per output channel bias. Range : int64 + * @param[in,out] out_0 pointer to output + * @return The function returns one of the two + * 1. The incremented output pointer for a successful operation or + * 2. NULL if implementation is not available. + * + * @details This function does the matrix multiplication of weight matrix for all output channels + * with 2 columns from im2col and produces two elements/output_channel. The outputs are + * clamped in the range provided by activation min and max. + * Supported framework: TensorFlow Lite micro. + */ +q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, + const q15_t *input_b, + const int32_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int16_t activation_min, + const int16_t activation_max, + const int32_t num_col_a, + const int64_t *const output_bias, + q15_t *out_0); +/** + * @brief General Matrix-multiplication without requantization for one row & one column + * @param[in] row_elements number of row elements + * @param[in] row_base pointer to row operand + * @param[in] col_base pointer to col operand + * @param[out] sum_col pointer to store sum of column elements + * @param[out] output pointer to store result of multiply-accumulate + * @return The function returns the multiply-accumulated result of the row by column. + * + * @details Pseudo-code + * *output = 0 + * sum_col = 0 + * for (i = 0; i < row_elements; i++) + * *output += row_base[i] * col_base[i] + * sum_col += col_base[i] + * + */ +arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, + const int8_t *row_base, + const int8_t *col_base, + int32_t *const sum_col, + int32_t *const output); + +/** + * @brief Matrix-multiplication with requantization & activation function for four rows and one column + * @param[in] row_elements number of row elements + * @param[in] offset offset between rows. Can be the same as row_elements. + * For e.g, in a 1x1 conv scenario with stride as 1. + * @param[in] row_base pointer to row operand + * @param[in] col_base pointer to col operand + * @param[in] out_ch Number of output channels + * @param[in] conv_params Pointer to convolution parameters like offsets and activation values + * @param[in] quant_params Pointer to per-channel quantization parameters + * @param[in] bias Pointer to per-channel bias + * @param[out] output Pointer to output where int8 results are stored. + * + * @return The function returns the updated output pointer or NULL if implementation is not available. + * + * @details Compliant to TFLM int8 specification. MVE implementation only + */ +int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, + const int32_t offset, + const int8_t *row_base, + const int8_t *col_base, + const int32_t out_ch, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const int32_t *bias, + int8_t *output); + +/** + * @brief General Matrix-multiplication function with per-channel requantization. + * This function assumes: + * - LHS input matrix NOT transposed (nt) + * - RHS input matrix transposed (t) + * + * @note This operation also performs the broadcast bias addition before the requantization + * + * @param[in] lhs Pointer to the LHS input matrix + * @param[in] rhs Pointer to the RHS input matrix + * @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of + * output columns (or RHS input rows) + * @param[out] dst Pointer to the output matrix with "m" rows and "n" columns + * @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization. + * The length of this vector is equal to the number of output columns (or RHS input + * rows) + * @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length + * of this vector is equal to the number of output columns (or RHS input rows) + * @param[in] lhs_rows Number of LHS input rows + * @param[in] rhs_rows Number of RHS input rows + * @param[in] rhs_cols Number of LHS/RHS input columns + * @param[in] lhs_offset Offset to be applied to the LHS input value + * @param[in] dst_offset Offset to be applied the output result + * @param[in] activation_min Minimum value to clamp down the output. Range : int8 + * @param[in] activation_max Maximum value to clamp up the output. Range : int8 + * + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ +arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, + const q7_t *rhs, + const q31_t *bias, + q7_t *dst, + const int32_t *dst_multipliers, + const int32_t *dst_shifts, + const int32_t lhs_rows, + const int32_t rhs_rows, + const int32_t rhs_cols, + const int32_t lhs_offset, + const int32_t dst_offset, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief s8 Vector by Matrix (transposed) multiplication + * + * @param[in] lhs Input left-hand side vector + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[in] bias Input bias + * @param[out] dst Output vector + * @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector. + * Range: -127 to 128 + * @param[in] rhs_offset Not used + * @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128 + * @param[in] dst_multiplier Output multiplier + * @param[in] dst_shift Output shift + * @param[in] rhs_cols Number of columns in the right-hand side input matrix + * @param[in] rhs_rows Number of rows in the right-hand side input matrix + * @param[in] activation_min Minimum value to clamp the output to. Range: int8 + * @param[in] activation_max Maximum value to clamp the output to. Range: int8 + * @param[in] address_offset Memory position offset for dst. First output is stored at 'dst', the + * second at 'dst + address_offset' and so on. Default value is typically 1. + * + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ +arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, + const q7_t *rhs, + const q31_t *bias, + q7_t *dst, + const int32_t lhs_offset, + const int32_t rhs_offset, + const int32_t dst_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max, + const int32_t address_offset); + +/** + * @brief s16 Vector by Matrix (transposed) multiplication + * + * @param[in] lhs Input left-hand side vector + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[in] bias Input bias + * @param[out] dst Output vector + * @param[in] dst_multiplier Output multiplier + * @param[in] dst_shift Output shift + * @param[in] rhs_cols Number of columns in the right-hand side input matrix + * @param[in] rhs_rows Number of rows in the right-hand side input matrix + * @param[in] activation_min Minimum value to clamp the output to. Range: int16 + * @param[in] activation_max Maximum value to clamp the output to. Range: int16 + * + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ +arm_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, + const q7_t *rhs, + const q63_t *bias, + q15_t *dst, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief s8 Vector by Matrix (transposed) multiplication with s16 output + * + * @param[in] lhs Input left-hand side vector + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[out] dst Output vector + * @param[in] lhs_offset Offset to be added to the input values of the left-hand side + * vector. Range: -127 to 128 + * @param[in] rhs_offset Not used + * @param[in] scatter_offset Address offset for dst. First output is stored at 'dst', the + * second at 'dst + scatter_offset' and so on. + * @param[in] dst_multiplier Output multiplier + * @param[in] dst_shift Output shift + * @param[in] rhs_cols Number of columns in the right-hand side input matrix + * @param[in] rhs_rows Number of rows in the right-hand side input matrix + * @param[in] activation_min Minimum value to clamp the output to. Range: int16 + * @param[in] activation_max Maximum value to clamp the output to. Range: int16 + * + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + */ +arm_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, + const q7_t *rhs, + q15_t *dst, + const int32_t lhs_offset, + const int32_t rhs_offset, + const int32_t scatter_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max); + +/** + * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where + * the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs. + * + * @param[in] lhs Input left-hand side matrix + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128 + * @param[in] num_ch Number of channels in LHS/RHS + * @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels + * @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels + * @param[in] out_offset Offset to be added to the output values. Range: -127 to 128 + * @param[in] activation_min Minimum value to clamp the output to. Range: int8 + * @param[in] activation_max Maximum value to clamp the output to. Range: int8 + * @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix + * @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels + * @param[in] out Output pointer + * + * @return The function returns one of the two + * - Updated output pointer if an implementation is available + * - NULL if no implementation is available. + * + * @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read + * out for the following. + * - Output shift + * - Output multiplier + * - Output bias + * - rhs + */ +q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, + const q7_t *rhs, + const int32_t lhs_offset, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + q7_t *out); + +/** + * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. + * Dimensions are the same for lhs and rhs. + * + * @param[in] lhs Input left-hand side matrix + * @param[in] rhs Input right-hand side matrix (transposed) + * @param[in] lhs_offset LHS matrix offset(input offset). Range: -127 to 128 + * @param[in] num_ch Number of channels in LHS/RHS + * @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels. + * @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels. + * @param[in] out_offset Offset to be added to the output values. Range: -127 to 128 + * @param[in] activation_min Minimum value to clamp the output to. Range: int8 + * @param[in] activation_max Maximum value to clamp the output to. Range: int8 + * @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix + * @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels. + * @param[in] out Output pointer + * + * @return The function returns one of the two + * - Updated output pointer if an implementation is available + * - NULL if no implementation is available. + * + * @note If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read + * out for the following. + * - Output shift + * - Output multiplier + * - Output bias + * - rhs + */ +q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, + const q7_t *rhs, + const int32_t lhs_offset, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + q7_t *out); + +/** + *@brief Matrix-multiplication function for convolution with reordered columns + *@param[in] pA pointer to operand A + *@param[in] pInBuffer pointer to operand B, always conssists of 2 vectors + *@param[in] ch_im_out numRow of A + *@param[in] numCol_A numCol of A + *@param[in] bias_shift amount of left-shift for bias + *@param[in] out_shift amount of right-shift for output + *@param[in] bias the bias + *@param[in,out] pOut pointer to output + *@return The function returns the incremented output pointer + * + *@details This function assumes that data in pInBuffer are reordered + */ +q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut); + +/** + @brief Read 2 q15 elements and post increment pointer. + @param[in] in_q15 Pointer to pointer that holds address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15) +{ + q31_t val; + + memcpy(&val, *in_q15, 4); + *in_q15 += 2; + + return (val); +} + +/** + @brief Read 4 q7 from q7 pointer and post increment pointer. + @param[in] in_q7 Pointer to pointer that holds address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7) +{ + q31_t val; + memcpy(&val, *in_q7, 4); + *in_q7 += 4; + + return (val); +} + +/** + @brief Read 2 q15 from q15 pointer. + @param[in] in_q15 pointer to address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15) +{ + q31_t val; + memcpy(&val, in_q15, 4); + + return (val); +} + +/** + @brief Read 4 q7 values. + @param[in] in_q7 pointer to address of input. + @return q31 value + */ +__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7) +{ + q31_t val; + memcpy(&val, in_q7, 4); + + return (val); +} + +/** + @brief Write four q7 to q7 pointer and increment pointer afterwards. + @param[in] in Double pointer to input value + @param[in] value Four bytes to copy + */ +__STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value) +{ + memcpy(*in, &value, 4); + *in += 4; +} + +/** + * @brief memset optimized for MVE + * @param[in, out] dst Destination pointer + * @param[in] val Value to set + * @param[in] block_size Number of bytes to copy. + * + */ +__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t block_size) +{ +#if defined(ARM_MATH_MVEI) + __asm volatile(" vdup.8 q0, %[set_val] \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vstrb.8 q0, [%[in]], #16 \n" + " letp lr, 2b \n" + "1: \n" + : [ in ] "+r"(dst) + : [ cnt ] "r"(block_size), [ set_val ] "r"(val) + : "q0", "memory", "r14"); +#else + memset(dst, val, block_size); +#endif +} + +#if defined(ARM_MATH_DSP) + +/** + * @brief read and expand one q7 word into two q15 words + */ + +__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t *out1, q31_t *out2) +{ + q31_t inA = arm_nn_read_q7x4_ia(&source); + q31_t inAbuf1 = __SXTB16_RORn((uint32_t)inA, 8); + q31_t inAbuf2 = __SXTB16(inA); + +#ifndef ARM_MATH_BIG_ENDIAN + *out2 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16)); + *out1 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16)); +#else + *out1 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16)); + *out2 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16)); +#endif + + return source; +} + +/** + * @brief read and expand one q7 word into two q15 words with reordering + */ + +__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t *out1, q31_t *out2) +{ + q31_t inA = arm_nn_read_q7x4_ia(&source); +#ifndef ARM_MATH_BIG_ENDIAN + *out2 = __SXTB16(__ROR((uint32_t)inA, 8)); + *out1 = __SXTB16(inA); +#else + *out1 = __SXTB16(__ROR((uint32_t)inA, 8)); + *out2 = __SXTB16(inA); +#endif + + return source; +} + +/** + * @brief read and expand one q7 word into two q15 words with reordering and add an offset + */ +__STATIC_FORCEINLINE const q7_t * +read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, q31_t offset) +{ + q31_t inA = arm_nn_read_q7x4_ia(&source); + +#ifndef ARM_MATH_BIG_ENDIAN + *out2 = __SXTB16(__ROR((uint32_t)inA, 8)); + *out1 = __SXTB16(inA); +#else + *out1 = __SXTB16(__ROR((uint32_t)inA, 8)); + *out2 = __SXTB16(inA); +#endif + *out1 = __QADD16(*out1, offset); + *out2 = __QADD16(*out2, offset); + + return source; +} + +#endif + +/** + * @defgroup NNBasicMath Basic Math Functions for Neural Network Computation + * + * Basic Math Functions for Neural Network Computation + * + */ + +/** + * @brief q7 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * @return none. + * + * <b>Scaling and Overflow Behavior:</b> + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated. + */ + +void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize); + +/** + * @brief q7 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * @return none. + * + * <b>Scaling and Overflow Behavior:</b> + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable q7 range [0x80 0x7F] will be saturated. + */ + +void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize); + +/** + * @brief Matrix-multiplication function for convolution with per-channel requantization. + * @param[in] input_a pointer to operand A + * @param[in] input_b pointer to operand B, always consists of 2 vectors. + * @param[in] output_ch number of rows of A + * @param[in] out_shift pointer to per output channel requantization shift parameter. + * @param[in] out_mult pointer to per output channel requantization multiplier parameter. + * @param[in] out_offset output tensor offset. + * @param[in] activation_min minimum value to clamp the output to. Range : int8 + * @param[in] activation_max maximum value to clamp the output to. Range : int8 + * @param[in] num_col_a number of columns of A + * @param[in] output_bias per output channel bias. Range : int32 + * @param[in,out] out_0 pointer to output + * @return The function returns one of the two + * 1. The incremented output pointer for a successful operation or + * 2. NULL if implementation is not available. + * + * @details This function does the matrix multiplication of weight matrix for all output channels + * with 2 columns from im2col and produces two elements/output_channel. The outputs are + * clamped in the range provided by activation min and max. + * Supported framework: TensorFlow Lite micro. + */ +q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0); + +/** + * @brief Common softmax function for s8 input and s8 or s16 output + * @param[in] input Pointer to the input tensor + * @param[in] num_rows Number of rows in the input tensor + * @param[in] row_size Number of elements in each input row + * @param[in] mult Input quantization multiplier + * @param[in] shift Input quantization shift within the range [0, 31] + * @param[in] diff_min Minimum difference with max in row. Used to check if + * the quantized exponential operation can be performed + * @param[in] int16_output Indicating s8 output if 0 else s16 output + * @param[out] output Pointer to the output tensor + * + * @note Supported framework: TensorFlow Lite micro (bit-accurate) + * + */ +void arm_nn_softmax_common_s8(const int8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + const bool int16_output, + void *output); + +/** + * @brief macro for adding rounding offset + */ +#ifndef ARM_NN_TRUNCATE +#define NN_ROUND(out_shift) ((0x1 << out_shift) >> 1) +#else +#define NN_ROUND(out_shift) 0 +#endif + +// Macros for shortening quantization functions' names and avoid long lines +#define MUL_SAT(a, b) arm_nn_doubling_high_mult((a), (b)) +#define MUL_SAT_MVE(a, b) arm_doubling_high_mult_mve_32x4((a), (b)) +#define MUL_POW2(a, b) arm_nn_mult_by_power_of_two((a), (b)) + +#define DIV_POW2(a, b) arm_nn_divide_by_power_of_two((a), (b)) +#define DIV_POW2_MVE(a, b) arm_divide_by_power_of_two_mve((a), (b)) + +#define EXP_ON_NEG(x) arm_nn_exp_on_negative_values((x)) +#define ONE_OVER1(x) arm_nn_one_over_one_plus_x_for_x_in_0_1((x)) + +/** + * @brief Saturating doubling high multiply. Result matches + * NEON instruction VQRDMULH. + * @param[in] m1 Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX} + * @param[in] m2 Multiplier. Range: {NN_Q31_MIN, NN_Q31_MAX} + * @return Result of multiplication. + * + */ +__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t m2) +{ + q31_t result = 0; + // Rounding offset to add for a right shift of 31 + q63_t mult = 1 << 30; + + if ((m1 < 0) ^ (m2 < 0)) + { + mult = 1 - mult; + } + // Gets resolved as a SMLAL instruction + mult = mult + (q63_t)m1 * m2; + + // Utilize all of the upper 32 bits. This is the doubling step + // as well. + result = (int32_t)(mult / (1ll << 31)); + + if ((m1 == m2) && (m1 == (int32_t)NN_Q31_MIN)) + { + result = NN_Q31_MAX; + } + return result; +} + +/** + * @brief Doubling high multiply without saturation. This is intended + * for requantization where the scale is a positive integer + * + * @param[in] m1 Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX} + * @param[in] m2 Multiplier Range: {NN_Q31_MIN, NN_Q31_MAX} + * @return Result of multiplication. + * @note The result of this matches that of neon instruction + * VQRDMULH for m1 in range {NN_Q31_MIN, NN_Q31_MAX} and m2 in + * range {NN_Q31_MIN + 1, NN_Q31_MAX}. Saturation occurs when + * m1 equals m2 equals NN_Q31_MIN and that is not handled by + * this function. + * + */ +__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, const q31_t m2) +{ + q31_t result = 0; + union arm_nn_long_long mult; + + // Rounding offset to add for a right shift of 31 + mult.word.low = 1 << 30; + mult.word.high = 0; + + // Gets resolved as a SMLAL instruction + mult.long_long = mult.long_long + (q63_t)m1 * m2; + + // Utilize all of the upper 32 bits. This is the doubling step + // as well. + result = (int32_t)(mult.long_long >> 31); + + return result; +} + +/** + * @brief Rounding divide by power of two. + * @param[in] dividend - Dividend + * @param[in] exponent - Divisor = power(2, exponent) + * Range: [0, 31] + * @return Rounded result of division. Midpoint is rounded away from zero. + * + */ +__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent) +{ + q31_t result = 0; + const q31_t remainder_mask = (1 << exponent) - 1; + int32_t remainder = remainder_mask & dividend; + + // Basic division + result = dividend >> exponent; + + // Adjust 'result' for rounding (mid point away from zero) + q31_t threshold = remainder_mask >> 1; + if (result < 0) + { + threshold++; + } + if (remainder > threshold) + { + result++; + } + + return result; +} + +/** + * @brief Requantize a given value. + * @param[in] val Value to be requantized + * @param[in] multiplier multiplier. Range {NN_Q31_MIN + 1, Q32_MAX} + * @param[in] shift left or right shift for 'val * multiplier' + * + * @return Returns (val * multiplier)/(2 ^ shift) + * + */ +__STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift) +{ +#ifdef CMSIS_NN_USE_SINGLE_ROUNDING + const int64_t total_shift = 31 - shift; + const int64_t new_val = val * (int64_t)multiplier; + + int32_t result = new_val >> (total_shift - 1); + result = (result + 1) >> 1; + + return result; +#else + return arm_nn_divide_by_power_of_two(arm_nn_doubling_high_mult_no_sat(val * (1 << LEFT_SHIFT(shift)), multiplier), + RIGHT_SHIFT(shift)); +#endif +} + +/** + * @brief Requantize a given 64 bit value. + * @param[in] val Value to be requantized in the range {-(1<<47)} to {(1<<47) - 1} + * @param[in] reduced_multiplier Reduced multiplier in the range {NN_Q31_MIN + 1, Q32_MAX} to {Q16_MIN + 1, + * Q16_MAX} + * @param[in] shift Left or right shift for 'val * multiplier' in the range {-31} to {7} + * + * @return Returns (val * multiplier)/(2 ^ shift) + * + */ +__STATIC_FORCEINLINE q31_t arm_nn_requantize_s64(const q63_t val, const q31_t reduced_multiplier, const q31_t shift) +{ + const q63_t new_val = val * reduced_multiplier; + + q31_t result = new_val >> (14 - shift); // 64->32 bit reduction + result = (result + 1) >> 1; // Last shift position and insert round + + return result; +} + +/** + * @brief memcpy optimized for MVE + * @param[in, out] dst Destination pointer + * @param[in] src Source pointer. + * @param[in] block_size Number of bytes to copy. + * + */ +__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size) +{ +#if defined(ARM_MATH_MVEI) + __asm volatile(" wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vldrb.8 q0, [%[in]], #16 \n" + " vstrb.8 q0, [%[out]], #16 \n" + " letp lr, 2b \n" + "1: \n" + : [ in ] "+r"(src), [ out ] "+r"(dst) + : [ cnt ] "r"(block_size) + : "q0", "memory", "r14"); +#else + memcpy(dst, src, block_size); +#endif +} + +#if defined(ARM_MATH_MVEI) +/** + * @brief Vector saturating doubling high multiply returning high half. + * @param[in] m1 Multiplicand + * @param[in] m2 Multiplier + * @return Result of multiplication. + * + */ +__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const q31_t m2) +{ + return vqrdmulhq_n_s32(m1, m2); +} + +/** + * @brief Vector rounding divide by power of two. + * @param[in] dividend - Dividend vector + * @param[in] exponent - Divisor = power(2, exponent) + * Range: [0, 31] + * @return Rounded result of division. Midpoint is rounded away from zero. + * + */ +__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent) +{ + const int32x4_t shift = vdupq_n_s32(-exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31); + const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup); + return vrshlq_s32(fixed_up_dividend, shift); +} + +/** + * @brief Requantize a given vector. + * @param[in] val Vector to be requantized + * @param[in] multiplier multiplier + * @param[in] shift shift + * + * @return Returns (val * multiplier)/(2 ^ shift) + * + */ +__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift) +{ +#ifdef CMSIS_NN_USE_SINGLE_ROUNDING + const int right_shift = MIN(-1, shift); + const int left_shift = shift - right_shift; + + const int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + const int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + int32x4_t result = vqdmulhq_n_s32(vshlq_s32(val, left_shift_dup), multiplier); + result = vrshlq_s32(result, right_shift_dup); + + return result; +#else + return arm_divide_by_power_of_two_mve( + arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier), RIGHT_SHIFT(shift)); +#endif +} + +__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2) +{ + return vqrdmulhq_s32(m1, m2); +} + +__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve_32x4(const int32x4_t dividend, const int32x4_t exponent) +{ + const int32x4_t shift = -exponent; + const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31); + const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup); + return vrshlq_s32(fixed_up_dividend, shift); +} + +__STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, + const int32x4_t multiplier, + const int32x4_t shift) +{ +#ifdef CMSIS_NN_USE_SINGLE_ROUNDING + const int32x4_t right_shift = vminq_s32(vdupq_n_s32(-1), shift); + const int32x4_t left_shift = vqsubq_s32(shift, right_shift); + + int32x4_t result = vqdmulhq_s32(vshlq_s32(val, left_shift), multiplier); + result = vrshlq_s32(result, right_shift); + + return result; +#else + const int32x4_t zz = vdupq_n_s32(0); + const mve_pred16_t p = vcmpgtq_n_s32(shift, 0); + + const int32x4_t left_shift = vpselq_s32(shift, zz, p); + const int32x4_t right_shift = -vpselq_s32(zz, shift, p); + + return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier), + right_shift); +#endif +} +#endif + +// @note The following functions are used only for softmax layer, scaled bits = 5 assumed + +__STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val) +{ + int32_t mask = 0; + int32_t shift = 24; + + const int32_t val_mod_minus_quarter = (val & ((1 << shift) - 1)) - (1 << shift); + const int32_t remainder = val_mod_minus_quarter - val; + const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28); + const int32_t x2 = MUL_SAT(x, x); + + int32_t result = 1895147668 + + MUL_SAT(1895147668, x + DIV_POW2(MUL_SAT(DIV_POW2(MUL_SAT(x2, x2), 2) + MUL_SAT(x2, x), 715827883) + x2, 1)); + +#define SELECT_IF_NON_ZERO(x) \ + { \ + mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \ + result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result); \ + } + + SELECT_IF_NON_ZERO(1672461947) + SELECT_IF_NON_ZERO(1302514674) + SELECT_IF_NON_ZERO(790015084) + SELECT_IF_NON_ZERO(290630308) + SELECT_IF_NON_ZERO(39332535) + SELECT_IF_NON_ZERO(720401) + SELECT_IF_NON_ZERO(242) + +#undef SELECT_IF_NON_ZERO + + mask = MASK_IF_ZERO(val); + return SELECT_USING_MASK(mask, NN_Q31_MAX, result); +} + +__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp) +{ + const int32_t thresh = ((1 << (31 - exp)) - 1); + int32_t result = val << exp; + result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), NN_Q31_MAX, result); + result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), NN_Q31_MIN, result); + return result; +} + +__STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val) +{ + const int64_t sum = (int64_t)val + (int64_t)NN_Q31_MAX; + const int32_t half_denominator = (int32_t)((sum + (sum >= 0 ? 1 : -1)) / 2L); + int32_t x = 1515870810 + MUL_SAT(half_denominator, -1010580540); + + const int32_t shift = (1 << 29); + x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2); + x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2); + x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2); + + return MUL_POW2(x, 1); +} + +/** + @brief Write 2 q15 elements and post increment pointer. + @param[in] dest_q15 Pointer to pointer that holds address of destination. + @param[in] src_q31 Input value to be written. + */ +__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31) +{ + q31_t val = src_q31; + + memcpy(*dest_q15, &val, 4); + *dest_q15 += 2; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/Drivers/CMSIS/NN/Source/ActivationFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/ActivationFunctions/CMakeLists.txt new file mode 100644 index 0000000..67c3f79 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ActivationFunctions/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# Copyright (c) 2019-2021 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8.c") +target_sources(cmsis-nn PRIVATE ${SRC}) diff --git a/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q15.c b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q15.c new file mode 100644 index 0000000..cb8a08f --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q15.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_activations_q15.c + * Description: Q15 neural network activation function using direct table look-up + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nn_tables.h" +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief neural network activation function using direct table look-up + * + * @note Refer header file for details. + * + */ + +void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type) +{ + uint16_t i = size; + q15_t *pIn = data; + q15_t *pOut = data; + uint16_t shift_size = 8 + 3 - int_width; + uint32_t bit_mask = 0x7FF >> int_width; + uint32_t full_frac = bit_mask + 1; + const q15_t *lookup_table; + + switch (type) + { + case ARM_SIGMOID: + lookup_table = sigmoidTable_q15; + break; + case ARM_TANH: + default: + lookup_table = tanhTable_q15; + break; + } + + while (i) + { + q15_t out; + q15_t in = *pIn++; + q15_t frac = (uint32_t)in & bit_mask; + q15_t value = lookup_table[(uint8_t)(in >> shift_size)]; + if ((in >> shift_size) != 0x7f) + { + q15_t value2 = lookup_table[(uint8_t)(1 + ((uint8_t)(in >> shift_size)))]; + /* doing the interpolation here for better accuracy */ + out = ((q31_t)(full_frac - frac) * value + (q31_t)value2 * frac) >> shift_size; + } + else + { + /* the largest positive value does not have a right side for linear interpolation */ + out = value; + } + + *pOut++ = out; + i--; + } +} + +/** + * @} end of Acti group + */ diff --git a/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q7.c b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q7.c new file mode 100644 index 0000000..72a0b15 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_nn_activations_q7.c @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_activations_q7.c + * Description: Q7 neural network activation function using direct table look-up + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nn_tables.h" +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief Q7 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * + * @details + * + * This is the direct table look-up approach. + * + * Assume here the integer part of the fixed-point is <= 3. + * More than 3 just not making much sense, makes no difference with + * saturation followed by any of these activation functions. + */ + +void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type) +{ + uint16_t i = size; + q7_t *pIn = data; + q7_t *pOut = data; + q7_t in; + q7_t out; + uint16_t shift_size = 3 - int_width; + const q7_t *lookup_table; + switch (type) + { + case ARM_SIGMOID: + lookup_table = sigmoidTable_q7; + break; + case ARM_TANH: + default: + lookup_table = tanhTable_q7; + break; + } + while (i) + { + in = *pIn++; + out = lookup_table[(uint8_t)(in >> shift_size)]; + *pOut++ = out; + i--; + } +} + +/** + * @} end of Acti group + */ diff --git a/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu6_s8.c b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu6_s8.c new file mode 100644 index 0000000..a460b30 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu6_s8.c @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_relu6_s8.c + * Description: Basic s8 version of ReLU6 + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/* + * Basic ReLU6 function + * + * Refer to header file for details. + * + */ + +void arm_relu6_s8(q7_t *data, uint16_t size) +{ + int32_t i; + + for (i = 0; i < size; i++) + { + int32_t ip = data[i]; + + ip = MAX(ip, 0); + data[i] = MIN(ip, 6); + } +} + +/** + * @} end of Acti group + */ diff --git a/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c new file mode 100644 index 0000000..1d4ea4e --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_relu_q15.c + * Description: Q15 version of ReLU + * + * $Date: 20. July 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief Q15 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * + * @details + * + * Optimized relu with QSUB instructions. + * + */ + +void arm_relu_q15(q15_t *data, uint16_t size) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for M cores with DSP extension */ + + uint16_t i = size >> 1; + q15_t *input = data; + q15_t *output = data; + q31_t in; + q31_t buf; + q31_t mask; + + while (i) + { + in = arm_nn_read_q15x2_ia((const q15_t **)&input); + + /* extract the first bit */ + buf = __ROR(in & 0x80008000, 15); + + /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ + mask = __QSUB16(0x00000000, buf); + + arm_nn_write_q15x2_ia(&output, in & (~mask)); + i--; + } + + if (size & 0x1) + { + if (*input < 0) + { + *input = 0; + } + input++; + } +#else + /* Run the following code as reference implementation for M cores without DSP extension */ + uint16_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + data[i] = 0; + } + +#endif /* ARM_MATH_DSP */ +} + +/** + * @} end of Acti group + */ diff --git a/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c new file mode 100644 index 0000000..a3163cd --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_relu_q7.c + * Description: Q7 version of ReLU + * + * $Date: 20. July 2021 + * $Revision: V.1.1.3 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief Q7 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * + * @details + * + * Optimized relu with QSUB instructions. + * + */ + +void arm_relu_q7(q7_t *data, uint16_t size) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for M cores with DSP extension */ + + uint16_t i = size >> 2; + q7_t *input = data; + q7_t *output = data; + q31_t in; + q31_t buf; + q31_t mask; + + while (i) + { + in = arm_nn_read_q7x4_ia((const q7_t **)&input); + + /* extract the first bit */ + buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7); + + /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ + mask = __QSUB8(0x00000000, buf); + + arm_nn_write_q7x4_ia(&output, in & (~mask)); + + i--; + } + + i = size & 0x3; + while (i) + { + if (*input < 0) + { + *input = 0; + } + input++; + i--; + } + +#else + /* Run the following code as reference implementation for cores without DSP extension */ + + uint16_t i; + + for (i = 0; i < size; i++) + { + if (data[i] < 0) + data[i] = 0; + } + +#endif +} + +/** + * @} end of Acti group + */ diff --git a/Drivers/CMSIS/NN/Source/BasicMathFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/BasicMathFunctions/CMakeLists.txt new file mode 100644 index 0000000..9d3f543 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/BasicMathFunctions/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# Copyright (c) 2019-2021 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_*.c") +target_sources(cmsis-nn PRIVATE ${SRC}) diff --git a/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s16.c b/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s16.c new file mode 100644 index 0000000..6b1366d --- /dev/null +++ b/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s16.c @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_elementwise_add_s16 + * Description: Elementwise add + * + * $Date: 14 Februari 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup BasicMath + * @{ + */ + +/* + * s16 elementwise add + * + * Refer header file for details. + * + */ + +/* Note: __SHIFT is expected to be <=0 */ + +arm_status arm_elementwise_add_s16(const int16_t *input_1_vect, + const int16_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_1_mult, + const int32_t input_1_shift, + const int32_t input_2_offset, + const int32_t input_2_mult, + const int32_t input_2_shift, + const int32_t left_shift, + int16_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size) +{ + (void)input_1_offset; + (void)input_2_offset; + (void)out_offset; + int32_t loop_count; + int32_t input_1; + int32_t input_2; + int32_t sum; + + loop_count = block_size; + + while (loop_count > 0) + { + /* C = A + B */ + input_1 = *input_1_vect++ << left_shift; + input_2 = *input_2_vect++ << left_shift; + + input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift); + input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift); + + sum = input_1 + input_2; + sum = arm_nn_requantize(sum, out_mult, out_shift); + + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + + *output++ = (int16_t)sum; + + /* Decrement loop counter */ + loop_count--; + } + + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of BasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c b/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c new file mode 100644 index 0000000..13b6bb3 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_elementwise_add_s8 + * Description: Elementwise add + * + * $Date: 3 Februari 2022 + * $Revision: V.2.6.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup BasicMath + * @{ + */ + +/* + * s8 elementwise add + * + * Refer header file for details. + * + */ + +/* Note: __SHIFT is expected to be <=0 */ + +arm_status arm_elementwise_add_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_1_mult, + const int32_t input_1_shift, + const int32_t input_2_offset, + const int32_t input_2_mult, + const int32_t input_2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size) +{ +#if defined(ARM_MATH_MVEI) + int32_t count = block_size; + + while (count > 0) + { + int32x4_t vect_1; + int32x4_t vect_2; + + mve_pred16_t p = vctp32q((uint32_t)count); + + vect_1 = vldrbq_z_s32(input_1_vect, p); + vect_2 = vldrbq_z_s32(input_2_vect, p); + + vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset)); + vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset)); + + vect_1 = vshlq_r_s32(vect_1, left_shift); + vect_2 = vshlq_r_s32(vect_2, left_shift); + + vect_1 = arm_requantize_mve(vect_1, input_1_mult, input_1_shift); + vect_2 = arm_requantize_mve(vect_2, input_2_mult, input_2_shift); + + vect_1 = vaddq_s32(vect_1, vect_2); + vect_1 = arm_requantize_mve(vect_1, out_mult, out_shift); + + vect_1 = vaddq_n_s32(vect_1, out_offset); + + vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min)); + vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max)); + + input_1_vect += 4; + input_2_vect += 4; + vstrbq_p_s32(output, vect_1, p); + + output += 4; + count -= 4; + } +#else + int32_t loop_count; + int32_t input_1; + int32_t input_2; + int32_t sum; + +#if defined(ARM_MATH_DSP) + int32_t a_1, b_1, a_2, b_2; + + int32_t offset_1_packed, offset_2_packed; + + int8_t r1, r2, r3, r4; + + offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL); + offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL); + + loop_count = block_size >> 2; + + while (loop_count > 0) + { + /* 4 outputs are calculated in one loop. The order of calculation is follows the order of output sign extension + intrinsic */ + input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1); + input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2); + + a_1 = __SADD16(a_1, offset_1_packed); + b_1 = __SADD16(b_1, offset_1_packed); + + a_2 = __SADD16(a_2, offset_2_packed); + b_2 = __SADD16(b_2, offset_2_packed); + + /* Sum 1 */ + input_1 = (b_1 & 0x0FFFF) << left_shift; + + input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift); + + input_2 = (b_2 & 0x0FFFF) << left_shift; + input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift); + + sum = input_1 + input_2; + sum = arm_nn_requantize(sum, out_mult, out_shift); + sum += out_offset; + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + r1 = (q7_t)sum; + + /* Sum 3 */ + input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift; + input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift); + + input_2 = ((b_2 >> 16) & 0x0FFFF) << left_shift; + input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift); + + sum = input_1 + input_2; + sum = arm_nn_requantize(sum, out_mult, out_shift); + sum += out_offset; + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + r3 = (q7_t)sum; + + /* Sum 2 */ + input_1 = (a_1 & 0x0FFFF) << left_shift; + input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift); + + input_2 = (a_2 & 0x0FFFF) << left_shift; + input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift); + + sum = input_1 + input_2; + sum = arm_nn_requantize(sum, out_mult, out_shift); + sum += out_offset; + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + r2 = (q7_t)sum; + + /* Sum 4 */ + input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift; + input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift); + + input_2 = ((a_2 >> 16) & 0x0FFFF) << left_shift; + input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift); + + sum = input_1 + input_2; + sum = arm_nn_requantize(sum, out_mult, out_shift); + sum += out_offset; + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + r4 = (q7_t)sum; + + arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4)); + + loop_count--; + } + + loop_count = block_size & 0x3; +#else + loop_count = block_size; +#endif + + while (loop_count > 0) + { + /* C = A + B */ + + input_1 = (*input_1_vect++ + input_1_offset) << left_shift; + input_2 = (*input_2_vect++ + input_2_offset) << left_shift; + + input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift); + input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift); + + sum = input_1 + input_2; + sum = arm_nn_requantize(sum, out_mult, out_shift); + sum += out_offset; + + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + + *output++ = (q7_t)sum; + + /* Decrement loop counter */ + loop_count--; + } + +#endif /* ARM_MATH_MVEI */ + + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of BasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s16.c b/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s16.c new file mode 100644 index 0000000..4e25574 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s16.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_elementwise_mul_s16 + * Description: Element wise multiplication + * + * $Date: 14 Februari 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup BasicMath + * @{ + */ + +/** + * @brief s16 element wise multiplication of two vectors + * + * @note Refer header file for details. + * + */ +arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect, + const int16_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_2_offset, + int16_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size) +{ + (void)input_1_offset; + (void)input_2_offset; + (void)out_offset; + int32_t loop_count; + int32_t input_1; + int32_t input_2; + int32_t mul_res; + + loop_count = block_size; + + while (loop_count > 0) + { + /* C = A * B */ + + input_1 = *input_1_vect++; + input_2 = *input_2_vect++; + + mul_res = input_1 * input_2; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift); + + mul_res = MAX(mul_res, out_activation_min); + mul_res = MIN(mul_res, out_activation_max); + + *output++ = (int16_t)mul_res; + + /* Decrement loop counter */ + loop_count--; + } + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of BasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c b/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c new file mode 100644 index 0000000..ff04cbf --- /dev/null +++ b/Drivers/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_elementwise_mul_s8 + * Description: Element wise multiplication + * + * $Date: 3 Februari 2022 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup BasicMath + * @{ + */ + +/** + * @brief s8 element wise multiplication of two vectors + * + * @note Refer header file for details. + * + */ + +arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_2_offset, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size) +{ + + int32_t loop_count; +#if defined(ARM_MATH_MVEI) + + loop_count = (block_size + 3) / 4; + uint32_t num_elements = block_size; + + for (int i = 0; i < loop_count; i++) + { + mve_pred16_t p = vctp32q(num_elements); + + int32x4_t input_1 = vldrbq_z_s32(input_1_vect, p); + input_1 = vaddq_n_s32(input_1, input_1_offset); + + int32x4_t input_2 = vldrbq_z_s32(input_2_vect, p); + input_2 = vaddq_n_s32(input_2, input_2_offset); + + int32x4_t res_0 = vmulq_s32(input_1, input_2); + + res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift)); + + res_0 += vdupq_n_s32(out_offset); + + res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min)); + res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max)); + + vstrbq_p_s32(output, res_0, p); + input_1_vect += 4; + input_2_vect += 4; + output += 4; + num_elements -= 4; + } + +#else + int32_t input_1; + int32_t input_2; + int32_t mul_res; + +#if defined(ARM_MATH_DSP) + int32_t a_1, b_1, a_2, b_2; + + int32_t offset_1_packed, offset_2_packed; + + int8_t r1, r2, r3, r4; + + offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL); + offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL); + + loop_count = block_size >> 2; + + while (loop_count > 0) + { + /* 4 outputs are calculated in one loop. The order of calculation is follows the order of output sign extension + intrinsic */ + input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1); + input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2); + + a_1 = __SADD16(a_1, offset_1_packed); + b_1 = __SADD16(b_1, offset_1_packed); + + a_2 = __SADD16(a_2, offset_2_packed); + b_2 = __SADD16(b_2, offset_2_packed); + + /* Mul 1 */ + input_1 = (int16_t)(b_1 & 0x0FFFFL); + input_2 = (int16_t)(b_2 & 0x0FFFFL); + + mul_res = input_1 * input_2; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + + mul_res = MAX(mul_res, out_activation_min); + mul_res = MIN(mul_res, out_activation_max); + r1 = (q7_t)mul_res; + + /* Mul 3 */ + input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL); + input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL); + + mul_res = input_1 * input_2; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = MAX(mul_res, out_activation_min); + mul_res = MIN(mul_res, out_activation_max); + r3 = (q7_t)mul_res; + + /* Mul 2 */ + input_1 = (int16_t)(a_1 & 0x0FFFFL); + input_2 = (int16_t)(a_2 & 0x0FFFFL); + + mul_res = input_1 * input_2; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = MAX(mul_res, out_activation_min); + mul_res = MIN(mul_res, out_activation_max); + r2 = (q7_t)mul_res; + + /* Mul 4 */ + input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL); + input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL); + + mul_res = input_1 * input_2; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + mul_res = MAX(mul_res, out_activation_min); + mul_res = MIN(mul_res, out_activation_max); + r4 = (q7_t)mul_res; + + arm_nn_write_q7x4_ia(&output, PACK_Q7x4_32x1(r1, r2, r3, r4)); + + loop_count--; + } + + loop_count = block_size & 0x3; +#else + loop_count = block_size; +#endif + + while (loop_count > 0) + { + /* C = A * B */ + + input_1 = *input_1_vect++ + input_1_offset; + input_2 = *input_2_vect++ + input_2_offset; + + mul_res = input_1 * input_2; + mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset; + + mul_res = MAX(mul_res, out_activation_min); + mul_res = MIN(mul_res, out_activation_max); + + *output++ = (q7_t)mul_res; + + /* Decrement loop counter */ + loop_count--; + } +#endif + return ARM_MATH_SUCCESS; +} + +/** + * @} end of BasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/CMakeLists.txt b/Drivers/CMSIS/NN/Source/CMakeLists.txt new file mode 100644 index 0000000..4150df3 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/CMakeLists.txt @@ -0,0 +1,98 @@ +# +# Copyright (c) 2019-2021 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +SET(ROOT ${CMSIS_PATH}) + +# Select which parts of the CMSIS-DSP must be compiled. +# There are some dependencies between the parts but they are not tracked +# by this cmake. So, enabling some functions may require to enable some +# other ones. +option(CONCATENATION "Concatenation" ON) +option(FULLYCONNECTED "Fully Connected" ON) +option(CONVOLUTION "Convolutions" ON) +option(ACTIVATION "Activations" ON) +option(POOLING "Pooling" ON) +option(SOFTMAX "Softmax" ON) +option(BASICMATHSNN "Basic Maths for NN" ON) +option(RESHAPE "Reshape" ON) +option(SVDF "SVDF" ON) + +# When OFF it is the default behavior : all tables are included. +option(NNSUPPORT "NN Support" ON) + + +########################### +# +# CMSIS NN +# +########################### + +# NN Sources +SET(NN ${ROOT}/CMSIS/NN) + +list(APPEND CMAKE_MODULE_PATH ${NN}/Source) + +add_library(cmsis-nn STATIC) + +target_compile_options(cmsis-nn PRIVATE -Ofast) + +### Includes +target_include_directories(cmsis-nn PUBLIC "${NN}/Include") +target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/Core/Include") +target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/DSP/Include") + +if (BASICMATHSNN) + add_subdirectory(BasicMathFunctions) +endif() + +if (CONCATENATION) + add_subdirectory(ConcatenationFunctions) +endif() + +if (FULLYCONNECTED) + add_subdirectory(FullyConnectedFunctions) +endif() + +if (CONVOLUTION) + add_subdirectory(ConvolutionFunctions) +endif() + +if (ACTIVATION) + add_subdirectory(ActivationFunctions) +endif() + +if (POOLING) + add_subdirectory(PoolingFunctions) +endif() + +if (SOFTMAX) + add_subdirectory(SoftmaxFunctions) +endif() + +if (SVDF) + add_subdirectory(SVDFunctions) +endif() + +if (RESHAPE) + add_subdirectory(ReshapeFunctions) +endif() + +# Keep NNSUPPORT at the end +if (NNSUPPORT) + add_subdirectory(NNSupportFunctions) +endif() diff --git a/Drivers/CMSIS/NN/Source/ConcatenationFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/CMakeLists.txt new file mode 100644 index 0000000..9d3f543 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# Copyright (c) 2019-2021 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_*.c") +target_sources(cmsis-nn PRIVATE ${SRC}) diff --git a/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c new file mode 100644 index 0000000..257e6a6 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_w.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_concatenation_s8_w.c + * Description: s8 version of concatenation along the W axis + * + * $Date: October 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Concatenation + * @{ + */ + +/* + * s8 version of concatenation along the W axis + * + * Refer to header file for details. + * + */ +void arm_concatenation_s8_w(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint32_t offset_w) +{ + const uint32_t input_copy_size = input_x * input_y * input_z * input_w; + + output += offset_w * (input_x * input_y * input_z); + + arm_memcpy_q7(output, input, input_copy_size); +} + +/** + * @} end of Concatenation group + */ diff --git a/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c new file mode 100644 index 0000000..7e8487a --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_x.c @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_concatenation_s8_x.c + * Description: s8 version of concatenation along the X axis + * + * $Date: October 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Concatenation + * @{ + */ + +/* + * s8 version of concatenation along the X axis + * + * Refer to header file for details. + * + */ +void arm_concatenation_s8_x(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_x, + const uint32_t offset_x) +{ + const uint32_t num_iterations = input_y * input_z * input_w; + + output += offset_x; + + uint32_t i; + + // Copy per row + for (i = 0; i < num_iterations; ++i) + { + arm_memcpy_q7(output, input, input_x); + input += input_x; + output += output_x; + } +} + +/** + * @} end of Concatenation group + */ diff --git a/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c new file mode 100644 index 0000000..075a702 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_y.c @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_concatenation_s8_y.c + * Description: s8 version of concatenation along the Y axis + * + * $Date: October 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Concatenation + * @{ + */ + +/* + * s8 version of concatenation along the Y axis + * + * Refer to header file for details. + * + */ +void arm_concatenation_s8_y(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_y, + const uint32_t offset_y) +{ + const uint32_t num_iterations = input_z * input_w; + const uint32_t input_copy_size = input_x * input_y; + const uint32_t output_stride = input_x * output_y; + + output += offset_y * input_x; + uint32_t i; + + // Copy per tile + for (i = 0; i < num_iterations; ++i) + { + arm_memcpy_q7(output, input, input_copy_size); + input += input_copy_size; + output += output_stride; + } +} + +/** + * @} end of Concatenation group + */ diff --git a/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c new file mode 100644 index 0000000..3bd84f2 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConcatenationFunctions/arm_concatenation_s8_z.c @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_concatenation_s8_z.c + * Description: s8 version of concatenation along the Z axis + * + * $Date: October 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Concatenation + * @{ + */ + +/* + * s8 version of concatenation along the Z axis + * + * Refer to header file for details. + * + */ +void arm_concatenation_s8_z(const int8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_z, + const uint16_t input_w, + int8_t *output, + const uint16_t output_z, + const uint32_t offset_z) +{ + const uint32_t input_copy_size = input_x * input_y * input_z; + const uint32_t output_stride = input_x * input_y * output_z; + + output += offset_z * (input_x * input_y); + + uint32_t i; + + for (i = 0; i < input_w; ++i) + { + arm_memcpy_q7(output, input, input_copy_size); + input += input_copy_size; + output += output_stride; + } +} + +/** + * @} end of Concatenation group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt new file mode 100644 index 0000000..30be0fe --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2019-2022 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8*.c") +file(GLOB SRC_S16 "./*_s16*.c") +target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16}) + + + diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c new file mode 100644 index 0000000..a3edd40 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1_x_n_s8.c + * Description: s8 version of 1xN convolution using symmetric quantization. + * + * $Date: December 14, 2021 + * $Revision: V.2.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * 1xN s8 convolution function. + * + * Refer header file for details. + * + */ + +arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + (void)bias_dims; + arm_status status = ARM_MATH_SUCCESS; + if (output_dims->w % 4 != 0) + { + status = ARM_MATH_SIZE_MISMATCH; + goto out; + } + +#if defined(ARM_MATH_MVEI) + (void)ctx; + + const uint16_t input_x = input_dims->w; + const uint16_t kernel_x = filter_dims->w; + const uint16_t output_x = output_dims->w; + const uint16_t output_ch = output_dims->c; + const uint16_t input_ch = input_dims->c; + const uint16_t pad_x = conv_params->padding.w; + const uint16_t stride_x = conv_params->stride.w; + + const int32_t input_offset = conv_params->input_offset; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4) + { + int32_t input_begin_idx[4]; + int32_t ker_begin_idx[4]; + int32_t ker_end_idx[4]; + + for (int i = 0; i < 4; i++) + { + const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x; + input_begin_idx[i] = MAX(0, est_input_x_idx); + ker_begin_idx[i] = MAX(0, -est_input_x_idx); + ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx); + } + + if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x)) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32x4_t s_offset; + int32_t acc[4]; + { + int32_t sum_row[4]; + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch, + input_data + input_begin_idx[0] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[0] * input_ch), + &sum_row[0], + &acc[0]); + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch, + input_data + input_begin_idx[1] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[1] * input_ch), + &sum_row[1], + &acc[1]); + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch, + input_data + input_begin_idx[2] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[2] * input_ch), + &sum_row[2], + &acc[2]); + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch, + input_data + input_begin_idx[3] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[3] * input_ch), + &sum_row[3], + &acc[3]); + + s_offset = vldrwq_s32(sum_row); + } + int32x4_t res = vldrwq_s32(acc); + s_offset = vmulq_n_s32(s_offset, input_offset); + res = vaddq_s32(res, s_offset); + if (bias_data) + { + res = vaddq_n_s32(res, bias_data[i_out_ch]); + } + res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]); + res = vaddq_n_s32(res, out_offset); + + res = vmaxq_s32(res, vdupq_n_s32(out_activation_min)); + res = vminq_s32(res, vdupq_n_s32(out_activation_max)); + + const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3}; + vstrbq_scatter_offset_s32(output_data, scatter_offset, res); + output_data++; + } + output_data += (3 * output_ch); + } + else + { + output_data = arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch, + stride_x * input_ch, + input_data + input_begin_idx[0] * input_ch, + filter_data, + output_ch, + conv_params, + quant_params, + bias_data, + output_data); + } + } + +#else + status = arm_convolve_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); +#endif + +out: + /* Return to application */ + return status; +} + +int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if !defined(ARM_MATH_MVEI) + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c new file mode 100644 index 0000000..3db3ba4 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1x1_HWC_q7_fast_nonsquare.c + * Description: Fast Q7 version of 1x1 convolution (non-square shape) + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1 + * and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise + * separable convolution. + * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + * + * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications + * https://arxiv.org/abs/1704.04861 + */ + +arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + (void)dim_im_in_y; + int16_t i_out_y, i_out_x; + int16_t i_ch_out; + + /* ----------------------- + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 || + padding_y != 0 || stride_x != 1 || stride_y != 1) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_out_y * dim_im_in_x + i_out_x) * ch_im_in, pBuffer, ch_im_in); + pBuffer += ch_im_in; + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* check if there is left-over for compute */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) + { + q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = bufferA; + /* basically each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad_reordered(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q7_t)__SSAT((sum >> out_shift), 8); + pOut++; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 || + padding_y != 0 || stride_x != 1 || stride_y != 1) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_y + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c new file mode 100644 index 0000000..6183f55 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1x1_s8_fast.c + * Description: Fast q7 version of 1x1 convolution (non-square shape) + * + * $Date: 12. November 2021 + * $Revision: V.2.0.4 + * + * Target Processor: Cortex-M Processors + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" +#include <stdio.h> + +#define DIM_KER_X (1U) +#define DIM_KER_Y (1U) + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Fast s8 version for 1x1 convolution (non-square shape) + * + * Refer header file for details. + * + */ + +arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + if (input_dims->c % 4 != 0 || conv_params->padding.w != 0 || conv_params->padding.h != 0 || + conv_params->stride.w != 1 || conv_params->stride.h != 1) + { + return ARM_MATH_SIZE_MISMATCH; + } + + (void)ctx; + (void)filter_dims; + (void)bias_dims; + +#if defined(ARM_MATH_MVEI) + + const int32_t col_len = input_dims->w * input_dims->h * input_dims->n; + const int32_t output_ch = output_dims->c; + const int32_t input_ch = input_dims->c; + const int32_t input_offset = conv_params->input_offset; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_items = 0; i_items <= (col_len - 4); i_items += 4) + { + + output_data = arm_nn_mat_mul_core_4x_s8(input_ch, + input_ch, + input_data + i_items * input_ch, + filter_data, + output_ch, + conv_params, + quant_params, + bias_data, + output_data); + } + + /* Handle left over elements */ + for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t sum_row = 0; + int32_t acc; + (void)arm_nn_mat_mul_core_1x_s8( + input_ch, input_data + i_items * input_ch, filter_data + i_out_ch * input_ch, &sum_row, &acc); + if (bias_data) + { + acc += bias_data[i_out_ch]; + } + sum_row = (sum_row * input_offset); + acc += sum_row; + acc = arm_nn_requantize(acc, output_mult[i_out_ch], output_shift[i_out_ch]); + acc += out_offset; + + acc = MAX(acc, out_activation_min); + acc = MIN(acc, out_activation_max); + *output_data++ = acc; + } + } + +#else + /* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */ + + const int32_t lhs_rows = input_dims->w * input_dims->h * input_dims->n; + const int32_t rhs_rows = output_dims->c; + const int32_t rhs_cols = input_dims->c; + + arm_nn_mat_mult_nt_t_s8(input_data, + filter_data, + bias_data, + output_data, + quant_params->multiplier, + quant_params->shift, + lhs_rows, + rhs_rows, + rhs_cols, + conv_params->input_offset, + conv_params->output_offset, + conv_params->activation.min, + conv_params->activation.max); + +#endif + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims) +{ + (void)input_dims; + return 0; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c new file mode 100644 index 0000000..0a6868a --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_basic.c + * Description: Q15 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * This basic version is designed to work for any input tensor and weight + * dimension. + */ + +arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + uint16_t im2col_out_pixel_index = 0; + q15_t *pBuffer = bufferA; + q15_t *pOut = Im_out; + q15_t *im_buffer = bufferA; + const q15_t *pA; + int i; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + pA = wt; + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = im_buffer; + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA1, inB1, sum); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q15_t)__SSAT((sum >> out_shift), 16); + pOut++; + } + + /* counter reset */ + pBuffer = im_buffer; + im2col_out_pixel_index++; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c new file mode 100644 index 0000000..66fbc00 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c @@ -0,0 +1,259 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_fast.c + * Description: Fast Q15 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multiple of 2 + * + * dim_im_out is a multiple of 2 + * + */ + +arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + q15_t *pBuffer = bufferA; + q15_t *im_buffer = bufferA; + q15_t *pOut = Im_out; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0 || dim_im_out & 0x1) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (i_out_x & 0x1) + { + int i; + /* initialize the matrix pointers for A */ + const q15_t *pA = wt; + + /* set up the second output pointers */ + q15_t *pOut2 = pOut + ch_im_out; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = im_buffer; + const q15_t *pB2 = pB + ch_im_in * dim_kernel * dim_kernel; + + /* aling the second pointer for A */ + const q15_t *pA2 = pA + ch_im_in * dim_kernel * dim_kernel; + + /* init the sum with bias */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 1; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA2); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA1, inB1, sum); + sum2 = __SMLAD(inA1, inB2, sum2); + sum3 = __SMLAD(inA2, inB1, sum3); + sum4 = __SMLAD(inA2, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x1; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q15_t)__SSAT(sum >> out_shift, 16); + *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16); + + /* skip the row computed with A2 */ + pA += ch_im_in * dim_kernel * dim_kernel; + } /* for over ch_im_out */ + + pOut += ch_im_out; + /* counter reset */ + pBuffer = im_buffer; + } + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c new file mode 100644 index 0000000..7babe51 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c @@ -0,0 +1,270 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_fast.c + * Description: Fast Q15 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q15 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multiple of 2 + * + */ + +arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + q15_t *pBuffer = bufferA; + q15_t *im_buffer = bufferA; + q15_t *pOut = Im_out; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (i_out_x & 0x1) + { + int i; + /* initialize the matrix pointers for A */ + const q15_t *pA = wt; + + /* set up the second output pointers */ + q15_t *pOut2 = pOut + ch_im_out; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = im_buffer; + const q15_t *pB2 = pB + ch_im_in * dim_kernel_y * dim_kernel_x; + + /* aling the second pointer for A */ + const q15_t *pA2 = pA + ch_im_in * dim_kernel_y * dim_kernel_x; + + /* init the sum with bias */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 1; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA2); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA1, inB1, sum); + sum2 = __SMLAD(inA1, inB2, sum2); + sum3 = __SMLAD(inA2, inB1, sum3); + sum4 = __SMLAD(inA2, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x1; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q15_t)__SSAT(sum >> out_shift, 16); + *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16); + + /* skip the row computed with A2 */ + pA += ch_im_in * dim_kernel_y * dim_kernel_x; + } /* for over ch_im_out */ + + pOut += ch_im_out; + /* counter reset */ + pBuffer = im_buffer; + } + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c new file mode 100644 index 0000000..618f492 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_RGB.c + * Description: Q7 version of convolution for RGB image + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 convolution function for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in equals 3 + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. This applies on the first layer of CNNs which has input + * image with RGB format. + */ + +arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + // check if number of input channels is 3 + if (ch_im_in != 3) + { + return ARM_MATH_SIZE_MISMATCH; + } + // This part implements the im2col function + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */ + arm_memset_q7((q7_t *)pBuffer, (q7_t)0, 3 * sizeof(q15_t)); + pBuffer += 3; + } + else + { + /* + * Equivalent to: + * arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3); + */ + + const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3; + q31_t buf = arm_nn_read_q7x4(pPixel); + + union arm_nnword top; + union arm_nnword bottom; + + top.word = __SXTB16(buf); + bottom.word = __SXTB16(__ROR(buf, 8)); + +#ifndef ARM_MATH_BIG_ENDIAN + /* + * little-endian, | omit | 3rd | 2nd | 1st | + * MSB LSB + * top | 3rd | 1st |; bottom | omit | 2nd | + * + * version 1, need to swap 2nd and 3rd weight + * *__SIMD32(pBuffer) = top.word; + * *(pBuffer+2) = bottom.half_words[0]; + * + * version 2, no weight shuffling required + */ + *pBuffer++ = top.half_words[0]; + int32_t packed_word = __PKHBT(bottom.word, top.word, 0); + arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4); +#else + /* + * big-endian, | 1st | 2nd | 3rd | omit | + * MSB LSB + * top | 2nd | omit |; bottom | 1st | 3rd | + * + * version 1, need to swap 2nd and 3rd weight + * *__SIMD32(pBuffer) = bottom.word; + * *(pBuffer+2) = top.half_words[1]; + * + * version 2, no weight shuffling required + */ + *pBuffer++ = bottom.half_words[0]; + int32_t packed_word = __PKHTB(top.word, bottom.word, 0); + arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4); +#endif + pBuffer += 2; + } + } + } + + if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, 3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q15_t *pB = bufferA; + /* basically each time it process 4 entries */ + uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pB); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = 3 * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + // check if number of input channels is 3 + if (ch_im_in != 3) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + /* if-for implementation */ + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c new file mode 100644 index 0000000..e274413 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_basic.c + * Description: Q7 version of convolution + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * This basic version is designed to work for any input tensor and weight + * dimension. + */ + +arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* Copying the pixel data to column */ + arm_q7_to_q15_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* Computation is filed for every 2 columns */ + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + + /* Point to the beging of the im2col buffer */ + const q15_t *pB = bufferA; + + /* Each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + (void)bufferA; + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + // if-for implementation + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c new file mode 100644 index 0000000..b42a57d --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_basic.c + * Description: Q7 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + */ + +arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* Copying the pixel data to column */ + arm_q7_to_q15_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* Computation is filed for every 2 columns */ + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_y * dim_kernel_x) + { + pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_y * dim_kernel_x, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + + /* Point to the beging of the im2col buffer */ + const q15_t *pB = bufferA; + + /* Each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 2; + + while (colCnt) + { + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + (void)bufferA; + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c new file mode 100644 index 0000000..51d98fd --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c @@ -0,0 +1,380 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_fast.c + * Description: Fast Q7 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in is multiple of 4 ( because of the SIMD32 read and swap ) + * + * ch_im_out is multiple of 2 ( bacause 2x2 mat_mult kernel ) + * + * The im2col converts the Q7 tensor input into Q15 column, which is stored in + * bufferA. There is reordering happenning during this im2col process with + * arm_q7_to_q15_reordered_no_shift. For every four elements, the second and + * third elements are swapped. + * + * The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the + * GEMM computation with the reordered columns. + * + * To speed-up the determination of the padding condition, we split the + * computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. + * This reduces the total number of boundary condition checks and improves + * the data copying performance. + */ + +arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* + * Here we split the entire matrix into three regions depending on the padding situation + * Top: i_out_y from 0 to padding - 1 + * Middle: i_out_y from padding to dim_im_out-padding-1 + * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 + */ + + /* top part */ + for (i_out_y = 0; i_out_y < padding; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* middle part, here we also divide the x into left, mid and right */ + for (; i_out_y < dim_im_out - padding; i_out_y++) + { + + /* left part */ + for (i_out_x = 0; i_out_x < padding; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* mid part */ + for (; i_out_x < dim_im_out - padding; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + arm_q7_to_q15_reordered_no_shift((q7_t *)Im_in + + (i_ker_y * dim_im_in + i_out_x * stride - padding) * ch_im_in, + pBuffer, + ch_im_in * dim_kernel); + pBuffer += ch_im_in * dim_kernel; + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* right part */ + for (; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + for (; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* check if there is left-over for compute */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = bufferA; + /* each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad_reordered(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q7_t)__SSAT((sum >> out_shift), 8); + pOut++; + } + } +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + // if-for implementation + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c new file mode 100644 index 0000000..25f17bb --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c @@ -0,0 +1,378 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_fast_nonsquare.c + * Description: Fast Q7 version of convolution (non-sqaure shape) + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + +arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* ----------------------- + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* + * Here we split the entire matrix into three regions depending on the padding situation + * Top: i_out_y from 0 to padding - 1 + * Middle: i_out_y from padding to dim_im_out-padding-1 + * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 + */ + + /* top part */ + for (i_out_y = 0; i_out_y < padding_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* middle part, here we also divide the x into left, mid and right */ + for (; i_out_y < dim_im_out_y - padding_y; i_out_y++) + { + + /* left part */ + for (i_out_x = 0; i_out_x < padding_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* mid part */ + for (; i_out_x < dim_im_out_x - padding_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in, + pBuffer, + ch_im_in * dim_kernel_x); + pBuffer += ch_im_in * dim_kernel_x; + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* right part */ + for (; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + for (; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* check if there is left-over for compute */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = bufferA; + /* basically each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad_reordered(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q7_t)__SSAT((sum >> out_shift), 8); + pOut++; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + /* if-for implementation */ + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c new file mode 100644 index 0000000..f509f26 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c @@ -0,0 +1,241 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_fast_s16.c + * Description: Optimized s16 version of convolution. + * + * $Date: 12 August 2021 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Basic s16 convolution function. + * + * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels + * are multiples of 4 or atleast greater than 4. + * + */ + +arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data) +{ + (void)bias_dims; + if (filter_dims->w * filter_dims->h * input_dims->c >= 512) + { + return ARM_MATH_SIZE_MISMATCH; + } + + if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q15_t *buffer_a = (q15_t *)ctx->buf; + + const int32_t input_batches = input_dims->n; + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t input_ch = input_dims->c; + const int32_t kernel_x = filter_dims->w; + const int32_t kernel_y = filter_dims->h; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_ch = output_dims->c; + + const int32_t pad_x = conv_params->padding.w; + const int32_t pad_y = conv_params->padding.h; + const int32_t stride_x = conv_params->stride.w; + const int32_t stride_y = conv_params->stride.h; + + const int16_t out_activation_min = conv_params->activation.min; + const int16_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Generate two columns from the input tensor a GEMM computation */ + q15_t *two_column_buf = buffer_a; + q15_t *out = output_data; + /* This part implements the im2col function */ + for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++) + { + for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++) + { + for (int32_t i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y; + i_ker_y++) + { + for (int32_t i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) + { + /* Filling 0 for out-of-bound paddings */ + arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch); + } + else + { + arm_memcpy_q7((q7_t *)two_column_buf, + (const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch), + input_ch * sizeof(q15_t)); + } + two_column_buf += input_ch; + } + } + /* Computation is filed for every 2 columns */ + if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x) + { + out = arm_nn_mat_mult_kernel_s16(filter_data, + buffer_a, + output_ch, + output_shift, + output_mult, + out_activation_min, + out_activation_max, + (input_ch * kernel_y * kernel_x), + bias_data, + out); + + /* Counter reset */ + two_column_buf = buffer_a; + } + } + } + + /* Left-over because odd number of output pixels */ + if (two_column_buf != buffer_a) + { + const q7_t *ker_a = filter_data; + int i; + + for (i = 0; i < output_ch; i++) + { + /* Init the accumulator*/ + q31_t sum = 0; + + /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ + const q15_t *ip_as_col = buffer_a; + + /* 4 multiply and accumulates are done in one loop. */ + uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; + + while (col_count) + { + q31_t ker_a1, ker_a2; + q31_t ip_b1, ip_b2; + + ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); + + ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a1, ip_b1, sum); + ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a2, ip_b2, sum); + + col_count--; + } + /* Handle left over mac */ + col_count = input_ch * kernel_y * kernel_x & 0x3; + while (col_count) + { + q7_t ker_a1 = *ker_a++; + q15_t ip_b1 = *ip_as_col++; + sum += ker_a1 * ip_b1; + col_count--; + } + if (bias_data) + { + q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]); + q63_t acc_64 = sum + bias_data[i]; + sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]); + } + else + { + sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]); + } + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + *out++ = (q15_t)sum; + } + } +#else + (void)input_data; + (void)output_data; + (void)bias_data; + (void)filter_data; + (void)buffer_a; + (void)kernel_x; + (void)kernel_y; + (void)pad_x; + (void)pad_y; + (void)stride_x; + (void)stride_y; + (void)out_activation_min; + (void)out_activation_max; + (void)output_mult; + (void)output_shift; + return ARM_MATH_ARGUMENT_ERROR; +#endif + /* Advance to the next batch */ + input_data += (input_x * input_y * input_ch); + output_data += (output_x * output_y * output_ch); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c new file mode 100644 index 0000000..9702575 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_s16.c + * Description: s16 version of convolution using symmetric quantization. + * + * $Date: January 13, 2022 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Basic s16 convolution function. + * + * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels + * are multiples of 4 or atleast greater than 4. + * + */ + +arm_status arm_convolve_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data) +{ + (void)bias_dims; + (void)ctx; + + const int32_t input_batches = input_dims->n; + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t input_ch = input_dims->c; + const int32_t kernel_x = filter_dims->w; + const int32_t kernel_y = filter_dims->h; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_ch = output_dims->c; + + const int32_t pad_x = conv_params->padding.w; + const int32_t pad_y = conv_params->padding.h; + const int32_t stride_x = conv_params->stride.w; + const int32_t stride_y = conv_params->stride.h; + const int32_t dilation_x = conv_params->dilation.w; + const int32_t dilation_y = conv_params->dilation.h; + + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + for (int32_t i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i_out_ch]); + + for (int32_t base_idx_y = -pad_y, i_out_y = 0; i_out_y < output_y; base_idx_y += stride_y, i_out_y++) + { + for (int32_t base_idx_x = -pad_x, i_out_x = 0; i_out_x < output_x; base_idx_x += stride_x, i_out_x++) + { + int64_t conv_out_acc = 0; + + const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y; + const int32_t ker_y_start = MAX(0, start_y_max); + const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x; + const int32_t ker_x_start = MAX(0, start_x_max); + const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y; + const int32_t ker_y_end = MIN(kernel_y, end_min_y); + const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x; + const int32_t ker_x_end = MIN(kernel_x, end_min_x); + + for (int32_t i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + for (int32_t i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t in_row = base_idx_y + dilation_y * i_ker_y; + const int32_t in_col = base_idx_x + dilation_x * i_ker_x; + + for (int32_t i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + conv_out_acc += input_data[(in_row * input_x + in_col) * input_ch + i_input_ch] * + filter_data[i_out_ch * input_ch * kernel_y * kernel_x + + (i_ker_y * kernel_x + i_ker_x) * input_ch + i_input_ch]; + } + } + } + + if (bias_data) + { + conv_out_acc += bias_data[i_out_ch]; + } + + int32_t conv_out = arm_nn_requantize_s64(conv_out_acc, reduced_multiplier, output_shift[i_out_ch]); + conv_out = MAX(conv_out, out_activation_min); + conv_out = MIN(conv_out, out_activation_max); + output_data[i_out_ch + (i_out_y * output_x + i_out_x) * output_ch] = (int16_t)conv_out; + } + } + } + /* Advance to the next batch */ + input_data += (input_x * input_y * input_ch); + output_data += (output_x * output_y * output_ch); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ + (void)input_dims; + (void)filter_dims; + return 0; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c new file mode 100644 index 0000000..e884b31 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_s8.c + * Description: s8 version of convolution using symmetric quantization. + * + * $Date: December 14, 2021 + * $Revision: V.2.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Basic s8 convolution function. + * + * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels + * are multiples of 4 or atleast greater than 4. + * + */ + +arm_status arm_convolve_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + (void)bias_dims; + + if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q15_t *buffer_a = (q15_t *)ctx->buf; + + const int32_t input_batches = input_dims->n; + const uint16_t input_x = input_dims->w; + const uint16_t input_y = input_dims->h; + const uint16_t input_ch = input_dims->c; + const uint16_t kernel_x = filter_dims->w; + const uint16_t kernel_y = filter_dims->h; + const uint16_t output_x = output_dims->w; + const uint16_t output_y = output_dims->h; + const uint16_t output_ch = output_dims->c; + + const uint16_t pad_x = conv_params->padding.w; + const uint16_t pad_y = conv_params->padding.h; + const uint16_t stride_x = conv_params->stride.w; + const uint16_t stride_y = conv_params->stride.h; + + const int32_t input_offset = conv_params->input_offset; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + int i_batch; + for (i_batch = 0; i_batch < input_batches; i_batch++) + { +#if defined(ARM_MATH_MVEI) + /* Generate upto four columns from the input tensor a GEMM computation */ + q7_t *im2col_buf = (q7_t *)buffer_a; + q7_t *out = output_data; + int32_t buffer_fill_cnt = 0; + int32_t padded = 0; + const int32_t num_elem = kernel_x * kernel_y * input_ch; + const int32_t dilation_x = conv_params->dilation.w; + const int32_t dilation_y = conv_params->dilation.h; + + /* This part implements the im2col function */ + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - pad_x; + const int32_t base_idx_y = stride_y * i_out_y - pad_y; + + for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++) + { + for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++) + { + const int32_t k_y = base_idx_y + dilation_y * i_ker_y; + const int32_t k_x = base_idx_x + dilation_x * i_ker_x; + + if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x) + { + memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch); + padded = 1; + } + else + { + arm_memcpy_q7(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch); + } + im2col_buf += input_ch; + } + } + + buffer_fill_cnt++; + + /* Computation is filed for every 4 columns */ + if (buffer_fill_cnt == 4 && (padded == 0)) + { + buffer_fill_cnt = 0; + out = arm_nn_mat_mul_core_4x_s8(num_elem, + num_elem, + (q7_t *)buffer_a, + filter_data, + output_ch, + conv_params, + quant_params, + bias_data, + out); + im2col_buf = (q7_t *)buffer_a; + } + else if (buffer_fill_cnt == 4 && (padded != 0)) + { + buffer_fill_cnt = 0; + out = arm_nn_mat_mult_s8(filter_data, + (q7_t *)buffer_a, + output_ch, + 4, + output_shift, + output_mult, + out_offset, + input_offset, + 0, + out_activation_min, + out_activation_max, + num_elem, + bias_data, + out); + + im2col_buf = (q7_t *)buffer_a; + padded = 0; + } + } + } + /* Handle left over columns */ + if (buffer_fill_cnt != 0) + { + out = arm_nn_mat_mult_s8(filter_data, + (q7_t *)buffer_a, + output_ch, + buffer_fill_cnt, + output_shift, + output_mult, + out_offset, + input_offset, + 0, + out_activation_min, + out_activation_max, + num_elem, + bias_data, + out); + } +#else // #if defined(ARM_MATH_MVEI) + const uint16_t dilation_x = conv_params->dilation.w; + const uint16_t dilation_y = conv_params->dilation.h; + + int32_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* Generate two columns from the input tensor a GEMM computation */ + q15_t *two_column_buf = buffer_a; + q7_t *out = output_data; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < output_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int32_t base_idx_y = stride_y * i_out_y - pad_y; + const int32_t base_idx_x = stride_x * i_out_x - pad_x; + + for (i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++) + { + for (i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++) + { + const int32_t k_y = base_idx_y + dilation_y * i_ker_y; + const int32_t k_x = base_idx_x + dilation_x * i_ker_x; + + if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x) + { + /* Filling 0 for out-of-bound paddings */ + memset(two_column_buf, 0, sizeof(q15_t) * input_ch); + } + else + { + /* Copying the pixel data to column */ + arm_q7_to_q15_with_offset( + input_data + (k_y * input_x + k_x) * input_ch, two_column_buf, input_ch, input_offset); + } + two_column_buf += input_ch; + } + } + + /* Computation is filed for every 2 columns */ + if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x) + { + out = arm_nn_mat_mult_kernel_s8_s16(filter_data, + buffer_a, + output_ch, + output_shift, + output_mult, + out_offset, + out_activation_min, + out_activation_max, + input_ch * kernel_y * kernel_x, + bias_data, + out); + + /* counter reset */ + two_column_buf = buffer_a; + } + } + } + + /* left-over because odd number of output pixels */ + if (two_column_buf != buffer_a) + { + const q7_t *ker_a = filter_data; + int i; + + for (i = 0; i < output_ch; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = 0; + if (bias_data) + { + sum = bias_data[i]; + } + + /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ + const q15_t *ip_as_col = buffer_a; + + /* 4 multiply and accumulates are done in one loop. */ +#if defined(ARM_MATH_DSP) + uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; + + while (col_count) + { + q31_t ker_a1, ker_a2; + q31_t ip_b1, ip_b2; + + ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); + + ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a1, ip_b1, sum); + ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a2, ip_b2, sum); + + col_count--; + } + /* Handle left over mac */ + col_count = input_ch * kernel_y * kernel_x & 0x3; +#else + uint16_t col_count = input_ch * kernel_y * kernel_x; +#endif + while (col_count) + { + q7_t ker_a1 = *ker_a++; + q15_t ip_b1 = *ip_as_col++; + sum += ker_a1 * ip_b1; + col_count--; + } + + sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]); + sum += out_offset; + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + *out++ = (q7_t)sum; + } + } +#endif // #if defined(ARM_MATH_MVEI) + /* Advance to the next batch */ + input_data += (input_x * input_y * input_ch); + output_data += (output_x * output_y * output_ch); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_MVEI) + int32_t col_length = input_dims->c * filter_dims->w * filter_dims->h; + // Get number of complete int16 lanes(multiple of 8) for given col_length. This is dependent on + // implementation of arm_nn_mat_mult_s8 + col_length = (col_length + 7) / 8; + // 4 -> number of im2col buffers, 8 -> 8 elements per Q register + return 4 * col_length * 8 * (int32_t)sizeof(int8_t); +#else + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c new file mode 100644 index 0000000..75bb26f --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_wrapper_s16.c + * Description: s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in + * cmsis-nn to perform the convolution. + * + * $Date: 13 January 2022 + * $Revision: V.1.2.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Convolution layer + * + * Refer header file for details. + * + */ + +arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data) +{ +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + if (filter_dims->w * filter_dims->h * input_dims->c < 512 && + (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_fast_s16(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else + { + return arm_convolve_s16(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } +#else + return arm_convolve_s16(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); +#endif +} + +int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)conv_params; + (void)output_dims; + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + if (filter_dims->w * filter_dims->h * input_dims->c < 512 && + (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims); + } + + return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); +#else + return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c new file mode 100644 index 0000000..bf1cd70 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_wrapper_s8.c + * Description: s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in + * cmsis-nn to perform the convolution. + * + * $Date: 02. December 2021 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Convolution layer + * + * Refer header file for details. + * + */ + +arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) && + (conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_1x1_s8_fast(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) && + (input_dims->n == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_1_x_n_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else + { + return arm_convolve_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } +} + +int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) && + (conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); + } + else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) && + (input_dims->n == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims); + } + else + { + return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); + } +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c new file mode 100644 index 0000000..d5569b3 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_3x3_s8.c + * Description: Optimized s8 depthwise convolution function for channel + * multiplier of 1 and 3x3 kernel size. + * + * $Date: 09. October 2020 + * $Revision: V.2.0.1 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Optimized s8 depthwise convolution function with constraint that + * in_channel == out_channel and kernel_x == kernel_y == 3 with pads at most 1 + * + * Refer prototype header file for details. + * + */ + +arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + (void)ctx; + (void)bias_dims; + + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t input_ch = input_dims->c; + const int32_t output_ch = output_dims->c; + const int32_t pad_x = dw_conv_params->padding.w; + const int32_t pad_y = dw_conv_params->padding.h; + const int32_t stride_x = dw_conv_params->stride.w; + const int32_t stride_y = dw_conv_params->stride.h; + const int32_t *output_shift = quant_params->shift; + const int32_t *output_mult = quant_params->multiplier; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_offset = dw_conv_params->output_offset; + const int32_t input_offset = dw_conv_params->input_offset; + const int32_t output_activation_min = dw_conv_params->activation.min; + const int32_t output_activation_max = dw_conv_params->activation.max; + + /* Check input constraints input_ch == output_ch */ + if (input_ch != output_ch) + { + return ARM_MATH_SIZE_MISMATCH; + } + /* Check input constraints pad_x <= 1 */ + if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + int32_t in_ch = 0; + int32_t ker_w_start = MAX(0, -in_w); + + for (; in_ch <= (input_ch - 4); in_ch += 4) + { + int32_t out_buff0 = bias[in_ch + 0]; + int32_t out_buff1 = bias[in_ch + 1]; + int32_t out_buff2 = bias[in_ch + 2]; + int32_t out_buff3 = bias[in_ch + 3]; + + const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch; + const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) + { + int32_t in_val = 0; + int32_t ker_val = 0; + + if (ker_w_start == 0) + { + in_val = arm_nn_read_q7x4(input_ptr); + ker_val = arm_nn_read_q7x4(kernel_ptr); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); + } + + in_val = arm_nn_read_q7x4(input_ptr + input_ch); + ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); + + if ((input_x - in_w) >= 3) + { + in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1)); + ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1)); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); + } + + input_ptr += (input_ch * input_x); + kernel_ptr += (input_ch * 3); + } + + out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]); + out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]); + out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]); + out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]); + + out_buff0 += output_offset; + out_buff1 += output_offset; + out_buff2 += output_offset; + out_buff3 += output_offset; + + out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max); + out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max); + out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max); + out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max); + + output[out_idx++] = (int8_t)out_buff0; + output[out_idx++] = (int8_t)out_buff1; + output[out_idx++] = (int8_t)out_buff2; + output[out_idx++] = (int8_t)out_buff3; + } + + // Leftover + for (; in_ch < input_ch; ++in_ch) + { + int32_t out_buff = bias[in_ch]; + + const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch; + const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) + { + if (ker_w_start == 0) + { + out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr); + } + + out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch); + + if ((input_x - in_w) >= 3) + { + out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1)); + } + + input_ptr += (input_ch * input_x); + kernel_ptr += (input_ch * 3); + } + + out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]); + out_buff += output_offset; + out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max); + output[out_idx++] = (int8_t)out_buff; + } + } + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c new file mode 100644 index 0000000..42e4bbd --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c @@ -0,0 +1,292 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_s16.c + * Description: s16 version of depthwise convolution. + * + * $Date: 26. Jan 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +static void __attribute__((unused)) depthwise_conv_s16_mult_4_s16(const int16_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const int8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int64_t *bias, + int16_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; + ++in_ch, out_ch += ch_mult) + { + for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4) + { + int32_t out_buff32[4] = {REDUCE_MULTIPLIER(output_mult[out_ch + 0 + mult_tile]), + REDUCE_MULTIPLIER(output_mult[out_ch + 1 + mult_tile]), + REDUCE_MULTIPLIER(output_mult[out_ch + 2 + mult_tile]), + REDUCE_MULTIPLIER(output_mult[out_ch + 3 + mult_tile])}; + + int64_t out_buff[4] = {0, 0, 0, 0}; + + if (bias) + { + out_buff[0] = bias[out_ch + 0 + mult_tile]; + out_buff[1] = bias[out_ch + 1 + mult_tile]; + out_buff[2] = bias[out_ch + 2 + mult_tile]; + out_buff[3] = bias[out_ch + 3 + mult_tile]; + } + + for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h) + { + int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch; + int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch; +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) +#pragma clang loop unroll(disable) +#endif + for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); + ++ker_w, ker_idx += output_ch) + { + // TODO: Unroll of 4 with 64 bit accumulator will probably result in too much register + // spills. Try with unroll of 2 when enabling this. + int32_t in_val = input[in_idx + ker_w * input_ch]; + out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile]; + out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile]; + out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile]; + out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile]; + } + } + + out_buff32[0] = + arm_nn_requantize_s64(out_buff[0], out_buff32[0], output_shift[out_ch + 0 + mult_tile]); + out_buff32[1] = + arm_nn_requantize_s64(out_buff[1], out_buff32[1], output_shift[out_ch + 1 + mult_tile]); + out_buff32[2] = + arm_nn_requantize_s64(out_buff[2], out_buff32[2], output_shift[out_ch + 2 + mult_tile]); + out_buff32[3] = + arm_nn_requantize_s64(out_buff[3], out_buff32[3], output_shift[out_ch + 3 + mult_tile]); + + out_buff32[0] = MIN(MAX(out_buff32[0], output_activation_min), output_activation_max); + out_buff32[1] = MIN(MAX(out_buff32[1], output_activation_min), output_activation_max); + out_buff32[2] = MIN(MAX(out_buff32[2], output_activation_min), output_activation_max); + out_buff32[3] = MIN(MAX(out_buff32[3], output_activation_min), output_activation_max); + + output[out_idx++] = (int16_t)out_buff32[0]; + output[out_idx++] = (int16_t)out_buff32[1]; + output[out_idx++] = (int16_t)out_buff32[2]; + output[out_idx++] = (int16_t)out_buff32[3]; + } + } + } + } +} + +static void depthwise_conv_s16_generic_s16(const int16_t *input, + const uint16_t input_batches, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const int8_t *kernel, + const uint16_t ch_mult, + const uint16_t kernel_x, + const uint16_t kernel_y, + const uint16_t pad_x, + const uint16_t pad_y, + const uint16_t stride_x, + const uint16_t stride_y, + const int64_t *bias, + int16_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max, + const uint16_t dilation_x, + const uint16_t dilation_y) + +{ + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; + + const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]); + int64_t acc_0 = 0; + + int ker_y_start; + int ker_x_start; + int ker_y_end; + int ker_x_end; + + if (dilation_x > 1) + { + const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x; + ker_x_start = MAX(0, start_x_max); + const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x; + ker_x_end = MIN(kernel_x, end_min_x); + } + else + { + ker_x_start = MAX(0, -base_idx_x); + ker_x_end = MIN(kernel_x, input_x - base_idx_x); + } + + if (dilation_y > 1) + { + const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y; + ker_y_start = MAX(0, start_y_max); + const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y; + ker_y_end = MIN(kernel_y, end_min_y); + } + else + { + ker_y_start = MAX(0, -base_idx_y); + ker_y_end = MIN(kernel_y, input_y - base_idx_y); + } + + if (bias) + { + acc_0 = bias[idx_out_ch]; + } + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + dilation_y * i_ker_y; + for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + dilation_x * i_ker_x; + int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch; + int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch; + + acc_0 += input[idx_0] * kernel[ker_idx_0]; + } + } + + /* Requantize and clamp output to provided range */ + int32_t result = arm_nn_requantize_s64(acc_0, reduced_multiplier, output_shift[idx_out_ch]); + result = MAX(result, output_activation_min); + result = MIN(result, output_activation_max); + *output++ = (int16_t)result; + } + } + } + } + /* Advance to the next batch */ + input += (input_x * input_y * input_ch); + } +} + +/* + * Basic s16 depthwise convolution function. + * + * Refer header file for details. + * + */ +arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int64_t *bias, + const cmsis_nn_dims *output_dims, + q15_t *output) +{ + const uint16_t dilation_x = dw_conv_params->dilation.w; + const uint16_t dilation_y = dw_conv_params->dilation.h; + + (void)bias_dims; + (void)ctx; + + depthwise_conv_s16_generic_s16(input, + input_dims->n, + input_dims->w, + input_dims->h, + input_dims->c, + kernel, + dw_conv_params->ch_mult, + filter_dims->w, + filter_dims->h, + dw_conv_params->padding.w, + dw_conv_params->padding.h, + dw_conv_params->stride.w, + dw_conv_params->stride.h, + bias, + output, + quant_params->shift, + quant_params->multiplier, + output_dims->w, + output_dims->h, + dw_conv_params->activation.min, + dw_conv_params->activation.max, + dilation_x, + dilation_y); + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c new file mode 100644 index 0000000..297b7af --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c @@ -0,0 +1,347 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_s8.c + * Description: s8 version of depthwise convolution. + * + * $Date: 30. Dec 2021 + * $Revision: V.2.7.1 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +static void depthwise_conv_s8_mult_4(const int8_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const int8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + int8_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; + ++in_ch, out_ch += ch_mult) + { + for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4) + { + int32_t out_buff[4] = {0, 0, 0, 0}; + if (bias) + { + out_buff[0] = bias[out_ch + 0 + mult_tile]; + out_buff[1] = bias[out_ch + 1 + mult_tile]; + out_buff[2] = bias[out_ch + 2 + mult_tile]; + out_buff[3] = bias[out_ch + 3 + mult_tile]; + } + + for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h) + { + int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch; + int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch; +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) +#pragma clang loop unroll(disable) +#endif + for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); + ++ker_w, ker_idx += output_ch) + { + int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset; + out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile]; + out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile]; + out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile]; + out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile]; + } + } +#if defined(ARM_MATH_MVEI) + (void)out_idx; + int32x4_t res = vldrwq_s32(out_buff); + res = arm_requantize_mve_32x4(res, + vldrwq_s32(&output_mult[out_ch + mult_tile]), + vldrwq_s32(&output_shift[out_ch + mult_tile])); + res = vaddq_n_s32(res, output_offset); + + res = vmaxq_s32(res, vdupq_n_s32(output_activation_min)); + res = vminq_s32(res, vdupq_n_s32(output_activation_max)); + vstrbq_s32(output, res); + output += 4; +#else + out_buff[0] = arm_nn_requantize( + out_buff[0], output_mult[out_ch + 0 + mult_tile], output_shift[out_ch + 0 + mult_tile]); + out_buff[1] = arm_nn_requantize( + out_buff[1], output_mult[out_ch + 1 + mult_tile], output_shift[out_ch + 1 + mult_tile]); + out_buff[2] = arm_nn_requantize( + out_buff[2], output_mult[out_ch + 2 + mult_tile], output_shift[out_ch + 2 + mult_tile]); + out_buff[3] = arm_nn_requantize( + out_buff[3], output_mult[out_ch + 3 + mult_tile], output_shift[out_ch + 3 + mult_tile]); + + out_buff[0] += output_offset; + out_buff[1] += output_offset; + out_buff[2] += output_offset; + out_buff[3] += output_offset; + + out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max); + out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max); + out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max); + out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max); + + output[out_idx++] = (int8_t)out_buff[0]; + output[out_idx++] = (int8_t)out_buff[1]; + output[out_idx++] = (int8_t)out_buff[2]; + output[out_idx++] = (int8_t)out_buff[3]; + +#endif + } + } + } + } +} + +static void depthwise_conv_s8_generic(const q7_t *input, + const uint16_t input_batches, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const q7_t *kernel, + const uint16_t output_ch, + const uint16_t ch_mult, + const uint16_t kernel_x, + const uint16_t kernel_y, + const uint16_t pad_x, + const uint16_t pad_y, + const uint16_t stride_x, + const uint16_t stride_y, + const int32_t *bias, + q7_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t output_activation_min, + const int32_t output_activation_max, + const uint16_t dilation_x, + const uint16_t dilation_y) + +{ + (void)output_ch; + int i_out = 0; + int i_batch; + + for (i_batch = 0; i_batch < input_batches; i_batch++) + { + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; + int32_t acc_0 = 0; + + int ker_y_start; + int ker_x_start; + int ker_y_end; + int ker_x_end; + + if (dilation_x > 1) + { + const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x; + ker_x_start = MAX(0, start_x_max); + const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x; + ker_x_end = MIN(kernel_x, end_min_x); + } + else + { + ker_x_start = MAX(0, -base_idx_x); + ker_x_end = MIN(kernel_x, input_x - base_idx_x); + } + + if (dilation_y > 1) + { + const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y; + ker_y_start = MAX(0, start_y_max); + const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y; + ker_y_end = MIN(kernel_y, end_min_y); + } + else + { + ker_y_start = MAX(0, -base_idx_y); + ker_y_end = MIN(kernel_y, input_y - base_idx_y); + } + + if (bias) + { + acc_0 = bias[idx_out_ch]; + } + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + dilation_y * i_ker_y; + for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + dilation_x * i_ker_x; + int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch; + int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch; + + acc_0 += (input[idx_0] + input_offset) * kernel[ker_idx_0]; + } + } + + /* Requantize and clamp output to provided range */ + acc_0 = arm_nn_requantize(acc_0, output_mult[idx_out_ch], output_shift[idx_out_ch]); + acc_0 += output_offset; + acc_0 = MAX(acc_0, output_activation_min); + acc_0 = MIN(acc_0, output_activation_max); + + output[i_out++] = acc_0; + } + } + } + } + /* Advance to the next batch */ + input += (input_x * input_y * input_ch); + } +} + +/* + * Basic s8 depthwise convolution function. + * + * Refer header file for details. + * Optimization using DSP extension is not available for the generic case where channel multiplier is > 1. + * + */ +arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + const uint16_t dilation_x = dw_conv_params->dilation.w; + const uint16_t dilation_y = dw_conv_params->dilation.h; + + (void)dw_conv_params->dilation; + (void)bias_dims; + (void)ctx; + + if (dw_conv_params->ch_mult % 4 == 0 && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + depthwise_conv_s8_mult_4(input, + input_dims->w, + input_dims->h, + input_dims->c, + kernel, + output_dims->c, + dw_conv_params->ch_mult, + filter_dims->w, + filter_dims->h, + dw_conv_params->padding.w, + dw_conv_params->padding.h, + dw_conv_params->stride.w, + dw_conv_params->stride.h, + bias, + output, + quant_params->shift, + quant_params->multiplier, + output_dims->w, + output_dims->h, + dw_conv_params->output_offset, + dw_conv_params->input_offset, + dw_conv_params->activation.min, + dw_conv_params->activation.max); + } + else + { + depthwise_conv_s8_generic(input, + input_dims->n, + input_dims->w, + input_dims->h, + input_dims->c, + kernel, + output_dims->c, + dw_conv_params->ch_mult, + filter_dims->w, + filter_dims->h, + dw_conv_params->padding.w, + dw_conv_params->padding.h, + dw_conv_params->stride.w, + dw_conv_params->stride.h, + bias, + output, + quant_params->shift, + quant_params->multiplier, + output_dims->w, + output_dims->h, + dw_conv_params->output_offset, + dw_conv_params->input_offset, + dw_conv_params->activation.min, + dw_conv_params->activation.max, + dilation_x, + dilation_y); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c new file mode 100644 index 0000000..1edac04 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c @@ -0,0 +1,433 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_s8_opt.c + * Description: Optimized s8 depthwise separable convolution function for + * channel multiplier of 1. + * + * $Date: January 26, 2021 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel + * + * Refer prototype header file for details. + * + */ + +arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + + const int32_t input_ch = input_dims->c; + const int32_t output_ch = output_dims->c; + + /* Check input constraints input_ch == output_ch */ + if (input_ch != output_ch) + { + return ARM_MATH_SIZE_MISMATCH; + } + + if (ctx->buf == NULL && arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims) > 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } +#ifdef ARM_MATH_DSP + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t kernel_y = filter_dims->h; + const int32_t pad_x = dw_conv_params->padding.w; + const int32_t pad_y = dw_conv_params->padding.h; + const int32_t stride_x = dw_conv_params->stride.w; + const int32_t stride_y = dw_conv_params->stride.h; + const int32_t *output_shift = quant_params->shift; + const int32_t *output_mult = quant_params->multiplier; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_offset = dw_conv_params->output_offset; + const int32_t input_offset = dw_conv_params->input_offset; + const int32_t output_activation_min = dw_conv_params->activation.min; + const int32_t output_activation_max = dw_conv_params->activation.max; + q15_t *buffer_a = (q15_t *)ctx->buf; + +#ifdef ARM_MATH_MVEI + (void)bias_dims; + /* Generate two columns from the input tensor */ + q7_t *lhs_buffer = (q7_t *)buffer_a; + q7_t *out = output; + int padded = 0; + int buffer_count = 0; + const int32_t kernel_size = kernel_x * kernel_y; + + /* This part implements the im2col function */ + for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++) + { + for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++) + { + for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++) + { + for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) + { + arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)input_ch); + padded = 1; + } + else + { + arm_memcpy_q7(lhs_buffer, input + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)input_ch); + } + lhs_buffer += input_ch; + } + } + buffer_count++; + + if (buffer_count == 4) + { + lhs_buffer = (q7_t *)buffer_a; + if (padded == 0) + { + out = arm_nn_depthwise_conv_nt_t_s8(lhs_buffer, + kernel, + input_offset, + input_ch, + output_shift, + output_mult, + output_offset, + output_activation_min, + output_activation_max, + kernel_size, + bias, + out); + } + else + { + out = arm_nn_depthwise_conv_nt_t_padded_s8(lhs_buffer, + kernel, + input_offset, + input_ch, + output_shift, + output_mult, + output_offset, + output_activation_min, + output_activation_max, + kernel_size, + bias, + out); + padded = 0; + } + buffer_count = 0; + } + } + } + + /* Handle left over buffers */ + lhs_buffer = (q7_t *)buffer_a; + + for (int i_buf = 0; i_buf < buffer_count; i_buf++) + { + int32_t loop_count = (input_ch + 3) / 4; + + int32_t num_ch_to_process = input_ch; + for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++) + { + const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset; + const int8_t *row_0 = kernel + offset; + int32x4_t out_0 = vldrwq_s32(&bias[offset]); + + for (int i_ker = 0; i_ker < kernel_size; i_ker++) + { + const int32x4_t ker_0 = vldrbq_s32(row_0); + + int32x4_t ip_0 = vldrbq_s32(col_0); + ip_0 = vaddq_n_s32(ip_0, input_offset); + out_0 += vmulq_s32(ip_0, ker_0); + + col_0 += input_ch; + row_0 += input_ch; + } + + const int32x4_t mult = vldrwq_s32(&output_mult[offset]); + const int32x4_t shift = vldrwq_s32(&output_shift[offset]); + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_0 = vaddq_n_s32(out_0, output_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max)); + mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process); + vstrbq_p_s32(out, out_0, p); + + out += 4; + } + + const int tail_ch = input_ch & 0x3; + if (tail_ch != 0) + { + out -= (4 - tail_ch); + } + } + +#else // ARM_MATH_DSP + (void)bias_dims; + /* Run the following code in cores using DSP extension */ + q15_t *const col_buffer_start = buffer_a; + q15_t *col_buffer = col_buffer_start; + const int32_t *const bias_start_pos = bias; + const q31_t *const out_mult_start_pos = output_mult; + const q31_t *const out_shift_start_pos = output_shift; + uint16_t row_count; + uint16_t row_shift; + + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + + /* Out of bounds is only considered for the y axis as it provides a contiguous zero'ing opportunity than + along the x axis */ + const int ker_y_start = MAX(0, -base_idx_y); + /* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */ + const int ker_y_end = MIN(kernel_y, input_y - base_idx_y); + + int32_t index = 0; + if (ker_y_start != 0) + { + memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t)); + index += (kernel_x * input_ch) * ker_y_start; + } + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y; + + for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x; + if (idx_x < 0 || idx_x >= input_x) + { + memset(&col_buffer[index], 0, input_ch * sizeof(q15_t)); + } + else + { + arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch, + &col_buffer[index], + input_ch, + input_offset); + } + index += input_ch; + } + } + + const int diff = kernel_y - ker_y_end; + if (diff != 0) + { + memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t)); + } + + row_count = output_ch / 4; + row_shift = 0; + bias = bias_start_pos; + output_mult = out_mult_start_pos; + output_shift = out_shift_start_pos; + + while (row_count) + { + q31_t sum = *bias++; + q31_t sum_2 = *bias++; + q31_t sum_3 = *bias++; + q31_t sum_4 = *bias++; + + uint16_t col_count = (kernel_x * kernel_y) / 2; + q15_t *col_pos = col_buffer_start + row_shift; + const q7_t *row_pos = kernel + row_shift; + row_shift += 4; + + while (col_count) + { + /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to + use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */ + /* Note: variable names can be improved here to align with rows and columns. */ + q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c; + /* Read 4 weights */ + ip_b1 = arm_nn_read_q7x4(row_pos); + ip_a1 = arm_nn_read_q7x4(row_pos + input_ch); + op_a = arm_nn_read_q15x2(col_pos); + op_b = arm_nn_read_q15x2(col_pos + input_ch); + + ip_a2 = __SXTB16(ip_b1); + ip_b1 = __SXTB16(__ROR(ip_b1, 8)); + + ip_b2 = __SXTB16(ip_a1); + ip_a1 = __SXTB16(__ROR(ip_a1, 8)); + + op_c = __PKHBT(op_b, op_a, 16); + op_a = __PKHTB(op_b, op_a, 16); + op_b = __PKHBT(ip_b2, ip_a2, 16); + sum = __SMLAD(op_c, op_b, sum); + + op_b = __PKHBT(ip_b1, ip_a1, 16); + sum_2 = __SMLAD(op_a, op_b, sum_2); + + op_a = arm_nn_read_q15x2(col_pos + 2); + op_b = arm_nn_read_q15x2(col_pos + input_ch + 2); + + op_c = __PKHBT(op_b, op_a, 16); + op_a = __PKHTB(op_b, op_a, 16); + op_b = __PKHTB(ip_a2, ip_b2, 16); + sum_3 = __SMLAD(op_c, op_b, sum_3); + + op_b = __PKHTB(ip_a1, ip_b1, 16); + sum_4 = __SMLAD(op_a, op_b, sum_4); + + row_pos += input_ch << 1; + col_pos += input_ch << 1; + col_count--; + } + + col_count = (kernel_x * kernel_y) & 0x1; + while (col_count) + { + sum += row_pos[0] * col_pos[0]; + sum_2 += row_pos[1] * col_pos[1]; + sum_3 += row_pos[2] * col_pos[2]; + sum_4 += row_pos[3] * col_pos[3]; + + row_pos += input_ch; + col_pos += input_ch; + + col_count--; + } + sum = arm_nn_requantize(sum, *output_mult++, *output_shift++); + sum += output_offset; + sum = MAX(sum, output_activation_min); + sum = MIN(sum, output_activation_max); + *output++ = (q7_t)sum; + + sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++); + sum_2 += output_offset; + sum_2 = MAX(sum_2, output_activation_min); + sum_2 = MIN(sum_2, output_activation_max); + *output++ = (q7_t)sum_2; + sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++); + sum_3 += output_offset; + sum_3 = MAX(sum_3, output_activation_min); + sum_3 = MIN(sum_3, output_activation_max); + *output++ = (q7_t)sum_3; + + sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++); + sum_4 += output_offset; + sum_4 = MAX(sum_4, output_activation_min); + sum_4 = MIN(sum_4, output_activation_max); + *output++ = (q7_t)sum_4; + + row_count--; + } + + row_count = output_ch & 0x3; + while (row_count) + { + q15_t *col_pos = col_buffer_start + row_shift; + const q7_t *row_pos = kernel + row_shift; + q31_t sum = *bias++; + const uint16_t col_count = (kernel_x * kernel_y); + row_shift += 1; + + for (int i = 0; i < col_count; i++) + { + sum += row_pos[i * input_ch] * col_pos[i * input_ch]; + } + sum = arm_nn_requantize(sum, *output_mult++, *output_shift++); + sum += output_offset; + sum = MAX(sum, output_activation_min); + sum = MIN(sum, output_activation_max); + *output++ = (q7_t)sum; + + row_count--; + } + + // clear counter and pointers + col_buffer = col_buffer_start; + } + } +#endif +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + return arm_depthwise_conv_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + kernel, + bias_dims, + bias, + output_dims, + output); +#endif /* ARM_MATH_MVEI | ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_MVEI) + /* The + 4 accounts for out of bounds read of the lhs buffers in the *_nt_t_* functions. */ + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t) + 4; +#elif defined(ARM_MATH_DSP) + return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c new file mode 100644 index 0000000..c9d0afc --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_u8_basic_ver1.c + * Description: u8 depthwise convolution function + * + * $Date: 09. October 2020 + * $Revision: V.1.1.1 + * + * Target : Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +static void depthwise_conv_u8_mult_4(const uint8_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const uint8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + uint8_t *output, + const int32_t output_shift, + const int32_t output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; + ++in_ch, out_ch += ch_mult) + { + for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4) + { + int32_t out_buff[4]; + + out_buff[0] = 0; + out_buff[1] = 0; + out_buff[2] = 0; + out_buff[3] = 0; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h) + { + int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch; + int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch; + + for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); + ++ker_w, ker_idx += output_ch) + { + int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset; + out_buff[0] += in_val * (kernel[ker_idx + 0 + mult_tile] + filter_offset); + out_buff[1] += in_val * (kernel[ker_idx + 1 + mult_tile] + filter_offset); + out_buff[2] += in_val * (kernel[ker_idx + 2 + mult_tile] + filter_offset); + out_buff[3] += in_val * (kernel[ker_idx + 3 + mult_tile] + filter_offset); + } + } + + if (bias != NULL) + { + out_buff[0] += bias[out_ch + 0 + mult_tile]; + out_buff[1] += bias[out_ch + 1 + mult_tile]; + out_buff[2] += bias[out_ch + 2 + mult_tile]; + out_buff[3] += bias[out_ch + 3 + mult_tile]; + } + out_buff[0] = arm_nn_requantize(out_buff[0], output_mult, output_shift); + out_buff[1] = arm_nn_requantize(out_buff[1], output_mult, output_shift); + out_buff[2] = arm_nn_requantize(out_buff[2], output_mult, output_shift); + out_buff[3] = arm_nn_requantize(out_buff[3], output_mult, output_shift); + + out_buff[0] += output_offset; + out_buff[1] += output_offset; + out_buff[2] += output_offset; + out_buff[3] += output_offset; + + out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max); + out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max); + out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max); + out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max); + + output[out_idx++] = (uint8_t)out_buff[0]; + output[out_idx++] = (uint8_t)out_buff[1]; + output[out_idx++] = (uint8_t)out_buff[2]; + output[out_idx++] = (uint8_t)out_buff[3]; + } + } + } + } +} + +static void depthwise_conv_u8_generic(const uint8_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const uint8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + uint8_t *output, + const int32_t output_shift, + const int32_t output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + (void)output_ch; + int i_out = 0; + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; + int32_t acc_0; + /* Condition for kernel start dimension: (base_idx_<x,y> + ker_<x,y>_start) >= 0 */ + const int ker_y_start = MAX(0, -base_idx_y); + const int ker_x_start = MAX(0, -base_idx_x); + /* Condition for kernel end dimension: (base_idx_<x,y> + ker_<x,y>_end) < input_<x,y> */ + const int ker_y_end = MIN(kernel_y, input_y - base_idx_y); + const int ker_x_end = MIN(kernel_x, input_x - base_idx_x); + acc_0 = 0; + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y; + for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x; + int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch; + int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch; + + acc_0 += (input[idx_0] + input_offset) * (kernel[ker_idx_0] + filter_offset); + } + } + if (bias != NULL) + { + acc_0 += bias[idx_out_ch]; + } + + /* Requantize and clamp output to provided range */ + acc_0 = arm_nn_requantize(acc_0, output_mult, output_shift); + acc_0 += output_offset; + acc_0 = MAX(acc_0, output_activation_min); + acc_0 = MIN(acc_0, output_activation_max); + + output[i_out++] = acc_0; + } + } + } + } +} + +/** + * @brief uint8 depthwise convolution function with asymmetric quantization + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_ch Channels in input tensor + * @param[in] kernel Pointer to kernel weights + * @param[in] kernel_x Width of kernel + * @param[in] kernel_y Height of kernel + * @param[in] ch_mult Number of channel multiplier + * @param[in] pad_x Padding sizes x + * @param[in] pad_y Padding sizes y + * @param[in] stride_x Convolution stride along the width + * @param[in] stride_y Convolution stride along the height + * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement. + * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement. + * @param[in] bias Pointer to optional bias values. If no bias is + * available, NULL is expected + * @param[in] input_offset Input tensor zero offset + * @param[in] filter_offset Kernel tensor zero offset + * @param[in] output_offset Output tensor zero offset + * @param[in,out] output Pointer to output tensor + * @param[in] output_x Width of output tensor + * @param[in] output_y Height of output tensor + * @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255} + * @param[in] output_activation_max Minimum value to clamp the output to. Range : {0, 255} + * @param[in] output_shift Amount of right-shift for output + * @param[in] output_mult Output multiplier for requantization + * @return The function returns one of the following + * <code>ARM_MATH_SIZE_MISMATCH</code> - Not supported dimension of tensors + * <code>ARM_MATH_SUCCESS</code> - Successful operation + * <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available + * + * + */ + +arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const uint8_t *kernel, + const uint16_t kernel_x, + const uint16_t kernel_y, + const int16_t ch_mult, + const int16_t pad_x, + const int16_t pad_y, + const int16_t stride_x, + const int16_t stride_y, + const int16_t dilation_x, + const int16_t dilation_y, + const int32_t *bias, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_offset, + uint8_t *output, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t output_shift, + const int32_t output_mult) +{ + (void)dilation_x; + (void)dilation_y; + + if (ch_mult % 4 == 0) + { + depthwise_conv_u8_mult_4(input, + input_x, + input_y, + input_ch, + kernel, + ch_mult * input_ch, + ch_mult, + kernel_x, + kernel_y, + pad_x, + pad_y, + stride_x, + stride_y, + bias, + output, + output_shift, + output_mult, + output_x, + output_y, + output_offset, + input_offset, + filter_offset, + output_activation_min, + output_activation_max); + } + else + { + depthwise_conv_u8_generic(input, + input_x, + input_y, + input_ch, + kernel, + ch_mult * input_ch, + ch_mult, + kernel_x, + kernel_y, + pad_x, + pad_y, + stride_x, + stride_y, + bias, + output, + output_shift, + output_mult, + output_x, + output_y, + output_offset, + input_offset, + filter_offset, + output_activation_min, + output_activation_max); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c new file mode 100644 index 0000000..23c8e46 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_wrapper_s8.c + * Description: Wrapper API to select appropriate depthwise conv API based + * on dimensions. + * + * $Date: 20. Dec 2021 + * $Revision: V.1.4.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * s8 Depthwise conv wrapper function + * + * Refer header file for details. + * + */ +arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *filter, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + arm_status status = ARM_MATH_SUCCESS; + if (1 == dw_conv_params->ch_mult && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { +#if !defined(ARM_MATH_MVEI) + if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1) && + (dw_conv_params->padding.w <= 1)) + { + status = arm_depthwise_conv_3x3_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + else +#endif + { + status = arm_depthwise_conv_s8_opt(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + } + else + { + status = arm_depthwise_conv_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + + /* Return to application */ + return status; +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)dw_conv_params; + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims); + } + + return size; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c new file mode 100644 index 0000000..729147f --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c @@ -0,0 +1,422 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_separable_conv_HWC_q7.c + * Description: Q7 depthwise separable convolution function + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in equals ch_im_out + * + * Implementation: + * There are 3 nested loop here: + * Inner loop: calculate each output value with MAC instruction over an accumulator + * Mid loop: loop over different output channel + * Outer loop: loop over different output (x, y) + */ + +arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x; + int16_t i_ker_y, i_ker_x; + q7_t *colBuffer = (q7_t *)bufferA; + q7_t *pBuffer = colBuffer; + const q7_t *pBias = bias; + q7_t *pOut = Im_out; + uint16_t rowCnt; + uint16_t row_shift; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* we first do im2col here */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q7(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, ch_im_in); + } + else + { + /* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + */ + memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* we will do the computation here for each channel */ + rowCnt = ch_im_out >> 2; + row_shift = 0; + pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = (dim_kernel * dim_kernel) >> 1; + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + row_shift += 4; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHTB(opB, inB1, 16); + inB1 = __PKHBT(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHTB(opB, inA1, 16); + inA1 = __PKHBT(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum3 = __SMLAD(opA, opB, sum3); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum4 = __SMLAD(opA, opB, sum4); + colCnt--; + } +#else + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHBT(opB, inB1, 16); + inB1 = __PKHTB(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHBT(opB, inA1, 16); + inA1 = __PKHTB(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum4 = __SMLAD(opA, opB, sum4); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum3 = __SMLAD(opA, opB, sum3); + colCnt--; + } + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + +#ifndef ARM_MATH_BIG_ENDIAN + /* + * r0 r1 r2 r3 r4 r5 + * inA1, inA2, inB1, inB2, opA, opB + */ + + asm volatile("COL_LOOP_%=:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhtb r3, r5, r2, ASR #16\n" + "pkhbt r2, r2, r5, LSL #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhtb r1, r5, r0, ASR #16\n" + "pkhbt r0, r0, r5, LSL #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum], r4, r5, %[sum]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#else + /* + * r0 r1 r2 r3 r4 r5 + * inA1, inA2, inB1, inB2, opA, opB + */ + asm volatile("COL_LOOP_%=:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhbt r3, r5, r2, LSL #16\n" + "pkhtb r2, r2, r5, ASR #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhbt r1, r5, r0, LSL #16\n" + "pkhtb r0, r0, r5, ASR #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum], r4, r5, %[sum]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = (dim_kernel * dim_kernel) & 0x1; + while (colCnt) + { + union arm_nnword inA, inB; + inA.word = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inB.word = arm_nn_read_q7x4(pB); + pB += ch_im_in; + sum += inA.bytes[0] * inB.bytes[0]; + sum2 += inA.bytes[1] * inB.bytes[1]; + sum3 += inA.bytes[2] * inB.bytes[2]; + sum4 += inA.bytes[3] * inB.bytes[3]; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = ch_im_out & 0x3; + while (rowCnt) + { + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = (dim_kernel * dim_kernel); + + row_shift += 1; + + while (colCnt) + { + q7_t A1 = *pA; + q7_t B1 = *pB; + pA += ch_im_in; + pB += ch_im_in; + sum += A1 * B1; + + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + rowCnt--; + } + + /* clear counter and pointers */ + pBuffer = colBuffer; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y; + int conv_out; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) + { + // for each output + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++) + { + for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++) + { + int in_row = stride * i_out_y + i_ker_y - padding; + int in_col = stride * i_out_x + i_ker_x - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c new file mode 100644 index 0000000..829acf9 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_separable_conv_HWC_q7_nonsquare.c + * Description: Q7 depthwise separable convolution function (non-square shape) + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is equal to ch_im_out + * + */ + +arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + + (void)bufferB; + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* + * Implementation: + * There are 3 nested loop here: + * Inner loop: calculate each output value with MAC instruction over an accumulator + * Mid loop: loop over different output channel + * Outer loop: loop over different output (x, y) + * + */ + + int16_t i_out_y, i_out_x; + int16_t i_ker_y, i_ker_x; + q7_t *colBuffer = (q7_t *)bufferA; + q7_t *pBuffer = colBuffer; + const q7_t *pBias = bias; + q7_t *pOut = Im_out; + uint16_t rowCnt; + uint16_t row_shift; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* we first do im2col here */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q7(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, ch_im_in); + } + else + { + /* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* we will do the computation here for each channel */ + rowCnt = ch_im_out >> 2; + row_shift = 0; + pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1; + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + row_shift += 4; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHTB(opB, inB1, 16); + inB1 = __PKHBT(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHTB(opB, inA1, 16); + inA1 = __PKHBT(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum3 = __SMLAD(opA, opB, sum3); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum4 = __SMLAD(opA, opB, sum4); + colCnt--; + } +#else + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHBT(opB, inB1, 16); + inB1 = __PKHTB(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHBT(opB, inA1, 16); + inA1 = __PKHTB(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum4 = __SMLAD(opA, opB, sum4); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum3 = __SMLAD(opA, opB, sum3); + colCnt--; + } + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + +#ifndef ARM_MATH_BIG_ENDIAN + // r0 r1 r2 r3 r4 r5 + // inA1, inA2, inB1, inB2, opA, opB + asm volatile("COL_LOOP:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhtb r3, r5, r2, ASR #16\n" + "pkhbt r2, r2, r5, LSL #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhtb r1, r5, r0, ASR #16\n" + "pkhbt r0, r0, r5, LSL #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum], r4, r5, %[sum]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#else + // r0 r1 r2 r3 r4 r5 + // inA1, inA2, inB1, inB2, opA, opB + asm volatile("COL_LOOP:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhbt r3, r5, r2, LSL #16\n" + "pkhtb r2, r2, r5, ASR #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhbt r1, r5, r0, LSL #16\n" + "pkhtb r0, r0, r5, ASR #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum], r4, r5, %[sum]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#endif /*ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = (dim_kernel_x * dim_kernel_y) & 0x1; + while (colCnt) + { + union arm_nnword inA, inB; + inA.word = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inB.word = arm_nn_read_q7x4(pB); + pB += ch_im_in; + sum += inA.bytes[0] * inB.bytes[0]; + sum2 += inA.bytes[1] * inB.bytes[1]; + sum3 += inA.bytes[2] * inB.bytes[2]; + sum4 += inA.bytes[3] * inB.bytes[3]; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = ch_im_out & 0x3; + while (rowCnt) + { + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = (dim_kernel_x * dim_kernel_y); + + row_shift += 1; + + while (colCnt) + { + q7_t A1 = *pA; + q7_t B1 = *pB; + pA += ch_im_in; + pB += ch_im_in; + sum += A1 * B1; + + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + rowCnt--; + } + + // clear counter and pointers + pBuffer = colBuffer; + } + } + +#else + (void)bufferA; + + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i_out_y, i_out_x, i_ch_out; + int i_ker_y, i_ker_x; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) + { + // for each output + int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++) + { + for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++) + { + int in_row = stride_y * i_out_y + i_ker_y - padding_y; + int in_col = stride_x * i_out_x + i_ker_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c new file mode 100644 index 0000000..481eeba --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_depthwise_conv_s8_core.c + * Description: Depthwise convolution on im2col buffers. + * + * $Date: 09. October 2020 + * $Revision: V.1.0.4 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/* + * Depthwise conv on an im2col buffer where the input channel equals + * output channel. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, + const q15_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + int32_t ch_per_loop = num_ch / 4; + + const int32_t *bias = output_bias; + int8_t *out_tmp = out; + + int32_t idx = 0; + + while (ch_per_loop > 0) + { + int32x4_t ip_0; + int32x4_t ip_1; + int32_t ker_loop = kernel_size / 3; + int32x4_t out_0 = vldrwq_s32(bias); + int32x4_t out_1 = out_0; + bias += 4; + + const int32_t offset = idx * 4; + const int8_t *row_0 = row + offset; + const int16_t *col_0 = col + offset; + const int16_t *col_1 = col + kernel_size * num_ch + offset; + + int32x4_t ker_0 = vldrbq_s32(row_0); + + while (ker_loop > 0) + { + const int8_t *row_1 = row_0 + num_ch; + const int8_t *row_2 = row_0 + 2 * num_ch; + const int32x4_t ker_1 = vldrbq_s32(row_1); + const int32x4_t ker_2 = vldrbq_s32(row_2); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_0); + out_1 += vmulq_s32(ip_1, ker_0); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_1); + out_1 += vmulq_s32(ip_1, ker_1); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_2); + out_1 += vmulq_s32(ip_1, ker_2); + row_0 += 3 * num_ch; + + ker_0 = vldrbq_s32(row_0); + ker_loop--; + } + + idx++; + /* Handle tail kernel elements */ + ker_loop = kernel_size - ((kernel_size / 3) * 3); + while (ker_loop > 0) + { + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + + out_0 += vmulq_s32(ip_0, ker_0); + out_1 += vmulq_s32(ip_1, ker_0); + + col_0 += num_ch; + col_1 += num_ch; + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + + row_0 += num_ch; + ker_0 = vldrbq_s32(row_0); + ker_loop--; + } + const int32x4_t mult = vldrwq_s32(out_mult); + const int32x4_t shift = vldrwq_s32(out_shift); + out_mult += 4; + out_shift += 4; + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_1 = arm_requantize_mve_32x4(out_1, mult, shift); + + out_0 = vaddq_n_s32(out_0, out_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max)); + vstrbq_s32(out_tmp, out_0); + + out_1 = vaddq_n_s32(out_1, out_offset); + out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min)); + out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max)); + vstrbq_s32(out_tmp + num_ch, out_1); + + out_tmp += 4; + ch_per_loop--; + } + + int32_t tail_ch = num_ch & 3; + if (tail_ch != 0) + { + int32_t ch_idx = (num_ch & ~3); + int32x4_t col_0_sum; + int32x4_t col_1_sum; + + const int32_t single_buffer_size = kernel_size * num_ch; + for (int i = 0; i < tail_ch; i++) + { + const int16_t *col_pos_0 = col + ch_idx; + const int16_t *col_pos_1 = col_pos_0 + single_buffer_size; + + const int8_t *row_pos = row + ch_idx; + int32_t sum_0 = bias[i]; + int32_t sum_1 = bias[i]; + + for (int j = 0; j < kernel_size; j++) + { + const int8_t row_val = row_pos[j * num_ch]; + sum_0 += row_val * col_pos_0[j * num_ch]; + sum_1 += row_val * col_pos_1[j * num_ch]; + } + col_0_sum[i] = sum_0; + col_1_sum[i] = sum_1; + + ch_idx++; + } + const mve_pred16_t p = vctp32q((uint32_t)tail_ch); + const int32x4_t mult = vldrwq_z_s32(out_mult, p); + const int32x4_t shift = vldrwq_z_s32(out_shift, p); + + col_0_sum = arm_requantize_mve_32x4(col_0_sum, mult, shift); + col_1_sum = arm_requantize_mve_32x4(col_1_sum, mult, shift); + + col_0_sum = vaddq_n_s32(col_0_sum, out_offset); + col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min)); + col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out_tmp, col_0_sum, p); + + col_1_sum = vaddq_n_s32(col_1_sum, out_offset); + col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min)); + col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p); + + out_tmp += tail_ch; + } + + return out_tmp + num_ch; +#else + (void)row; + (void)col; + (void)num_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)kernel_size; + (void)output_bias; + (void)out; + return NULL; +#endif +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c new file mode 100644 index 0000000..05c95b6 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_q7_q15.c + * Description: Matrix-multiplication function for convolution + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @brief Matrix-multiplication function for convolution. + * + * @details Refer to header file for details. + * + */ + +q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut) +{ +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *pOut2 = pOut + ch_im_out; + const q7_t *pBias = bias; + + uint16_t rowCnt = ch_im_out >> 1; + /* this loop over rows in A */ + while (rowCnt) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* align the second pointer for A */ + const q7_t *pA2 = pA + numCol_A; + + /* init the sum with bias */ + q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA11, inA12, inA21, inA22; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad(pA, &inA11, &inA12); + pA2 = read_and_pad(pA2, &inA21, &inA22); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + sum3 = __SMLAD(inA21, inB1, sum3); + sum4 = __SMLAD(inA21, inB2, sum4); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + sum3 = __SMLAD(inA22, inB1, sum3); + sum4 = __SMLAD(inA22, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q7_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + /* skip the row computed with A2 */ + pA += numCol_A; + rowCnt--; + } /* for over ch_im_out */ + + /* compute left-over row if any */ + if (ch_im_out & 0x1) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* load the bias */ + q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + while (colCnt) + { + q31_t inA11, inA12; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad(pA, &inA11, &inA12); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + + colCnt--; + } + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + } + + pOut += ch_im_out; + + /* return the new output pointer with offset */ + return pOut; +#else + (void)pA; + (void)pInBuffer; + (void)ch_im_out; + (void)numCol_A; + (void)bias_shift; + (void)out_shift; + (void)bias; + (void)pOut; + /* To be completed */ + return NULL; +#endif /* ARM_MATH_DSP */ +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c new file mode 100644 index 0000000..0870ac3 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_q7_q15_reordered.c + * Description: Matrix-multiplication function for convolution with reordered columns + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @brief Matrix-multiplication function for convolution with re-ordered input. + * + * @details Refer to header file for details. + * + */ + +q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut) +{ + +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *pOut2 = pOut + ch_im_out; + int i; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* align the second pointer for A */ + const q7_t *pA2 = pA + numCol_A; + + /* init the sum with bias */ + q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA11, inA12, inA21, inA22; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad_reordered(pA, &inA11, &inA12); + pA2 = read_and_pad_reordered(pA2, &inA21, &inA22); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + sum3 = __SMLAD(inA21, inB1, sum3); + sum4 = __SMLAD(inA21, inB2, sum4); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + sum3 = __SMLAD(inA22, inB1, sum3); + sum4 = __SMLAD(inA22, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q7_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + /* skip the row computed with A2 */ + pA += numCol_A; + } /* for over ch_im_out */ + + pOut += ch_im_out; + + /* return the new output pointer with offset */ + return pOut; +#else + (void)pA; + (void)pInBuffer; + (void)ch_im_out; + (void)numCol_A; + (void)bias_shift; + (void)out_shift; + (void)bias; + (void)pOut; + /* To be completed */ + return NULL; +#endif /* ARM_MATH_DSP */ +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c new file mode 100644 index 0000000..cb30068 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_s8_s16.c + * Description: Matrix-multiplication function for convolution + * + * $Date: 14. December 2021 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/* + * Matrix-multiplication function for convolution with per-channel requantization. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0) +{ +#if !defined(ARM_MATH_MVEI) + /* set up the second output pointers */ + q7_t *out_1 = out_0 + output_ch; + const int32_t *bias = output_bias; + + uint16_t row_count = output_ch / 2; + const q7_t *ip_a0 = input_a; + /* this loop over rows in A */ + while (row_count) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* align the second pointer for A */ + const q7_t *ip_a1 = ip_a0 + num_col_a; + + q31_t ch_0_out_0 = 0; + q31_t ch_0_out_1 = 0; + q31_t ch_1_out_0 = 0; + q31_t ch_1_out_1 = 0; + /* Init accumulator with bias for channel N and N + 1 */ + if (bias) + { + ch_0_out_0 = *bias; + ch_0_out_1 = *bias++; + ch_1_out_0 = *bias; + ch_1_out_1 = *bias++; + } + +#if defined(ARM_MATH_DSP) + uint16_t col_count = num_col_a / 4; + /* accumulate over the vector */ + while (col_count) + { + q31_t a01, a02, a11, a12; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad(ip_a0, &a01, &a02); + ip_a1 = read_and_pad(ip_a1, &a11, &a12); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + + col_count--; + } /* while over col_count */ + col_count = num_col_a & 0x3; +#else + uint16_t col_count = num_col_a; +#endif + while (col_count) + { + q7_t a0 = *ip_a0++; + q15_t b0 = *ip_b0++; + q7_t a1 = *ip_a1++; + q15_t b1 = *ip_b1++; + + ch_0_out_0 += a0 * b0; + ch_0_out_1 += a0 * b1; + ch_1_out_0 += a1 * b0; + ch_1_out_1 += a1 * b1; + col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + + ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift); + ch_1_out_0 += out_offset; + ch_1_out_0 = MAX(ch_1_out_0, activation_min); + ch_1_out_0 = MIN(ch_1_out_0, activation_max); + *out_0++ = (q7_t)ch_1_out_0; + + ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); + ch_1_out_1 += out_offset; + ch_1_out_1 = MAX(ch_1_out_1, activation_min); + ch_1_out_1 = MIN(ch_1_out_1, activation_max); + *out_1++ = (q7_t)ch_1_out_1; + out_mult++; + out_shift++; + + /* skip row */ + ip_a0 += num_col_a; + row_count--; + } + + /* compute the last odd numbered row if any */ + if (output_ch & 0x1) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + q31_t ch_0_out_0 = 0; + q31_t ch_0_out_1 = 0; + + /* load the bias */ + if (bias) + { + ch_0_out_0 = *bias; + ch_0_out_1 = *bias++; + } + +#if defined(ARM_MATH_DSP) + uint16_t col_count = num_col_a >> 2; + while (col_count) + { + q31_t a01, a02; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad(ip_a0, &a01, &a02); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + + col_count--; + } + col_count = num_col_a & 0x3; +#else + uint16_t col_count = num_col_a; +#endif + while (col_count) + { + q7_t a0 = *ip_a0++; + q15_t b0 = *ip_b0++; + q15_t b1 = *ip_b1++; + + ch_0_out_0 += a0 * b0; + ch_0_out_1 += a0 * b1; + col_count--; + } + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + } + + out_0 += output_ch; + + /* return the new output pointer with offset */ + return out_0; +#else + (void)input_a; + (void)input_b; + (void)output_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)num_col_a; + (void)output_bias; + (void)out_0; + /* To be completed */ + return NULL; +#endif +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c new file mode 100644 index 0000000..842a180 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_s8_s16_reordered.c + * Description: Matrix-multiplication function for convolution with reordered columns + * + * $Date: 09. October 2020 + * $Revision: V.1.0.3 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/* + * Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel + * requantization. The re-ordering is a consequence of sign extension is done by the SXTB16 command. + * + * Refer header file for details. This function differs from arm_nn_mat_mult_kernel_s8_s16(), in that it uses + * read_and_pad_reordered() instead of arm_nn_mat_mult_kernel_s8_s16(). Investigating the cycles impact and + * unifying these two functions is a potential future improvement. + * + */ + +q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0) +{ +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *out_1 = out_0 + output_ch; + const int32_t *bias = output_bias; + + uint16_t row_count = output_ch / 2; + const q7_t *ip_a0 = input_a; + /* this loop over rows in A */ + while (row_count) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* align the second pointer for A */ + const q7_t *ip_a1 = ip_a0 + num_col_a; + + /* Init accumulator with bias for channel N and N + 1 */ + q31_t ch_0_out_0 = *bias; + q31_t ch_0_out_1 = *bias++; + q31_t ch_1_out_0 = *bias; + q31_t ch_1_out_1 = *bias++; + + uint16_t col_count = num_col_a / 4; + /* accumulate over the vector */ + while (col_count) + { + q31_t a01, a02, a11, a12; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02); + ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + + col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + + ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift); + ch_1_out_0 += out_offset; + ch_1_out_0 = MAX(ch_1_out_0, activation_min); + ch_1_out_0 = MIN(ch_1_out_0, activation_max); + *out_0++ = (q7_t)ch_1_out_0; + + ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); + ch_1_out_1 += out_offset; + ch_1_out_1 = MAX(ch_1_out_1, activation_min); + ch_1_out_1 = MIN(ch_1_out_1, activation_max); + *out_1++ = (q7_t)ch_1_out_1; + out_mult++; + out_shift++; + + /* skip row */ + ip_a0 += num_col_a; + row_count--; + } + + if (output_ch & 1) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* Init accumulator with bias for channel N + 1 */ + q31_t ch_0_out_0 = *bias; + q31_t ch_0_out_1 = ch_0_out_0; + + int32_t col_count = num_col_a / 4; + while (col_count) + { + q31_t a01, a02; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + + col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + } + + out_0 += output_ch; + + /* return the new output pointer with offset */ + return out_0; +#else + (void)input_a; + (void)input_b; + (void)output_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)num_col_a; + (void)output_bias; + (void)out_0; + /* To be completed */ + return NULL; +#endif +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c new file mode 100644 index 0000000..adfa702 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_s8.c + * Description: General Matrix-multiplication function + * + * $Date: 27. October 2021 + * $Revision: V.2.0.6 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/* + * s8 General matrix multiplication function with per-channel requantization for upto 4 column batches. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, + const q7_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t row_len, + const int32_t *const bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + (void)row_offset; + if (col_batches == 4) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t row_len_tmp = row_len; + const int8_t *ip_r0 = input_row + (i_out_ch * row_len); + const int8_t *ip_c0 = input_col; + const int8_t *ip_c1 = input_col + row_len; + const int8_t *ip_c2 = input_col + (2 * row_len); + const int8_t *ip_c3 = input_col + (3 * row_len); + + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + int32_t acc_3 = 0; + const int32_t row_loop_cnt = (row_len + 7) / 8; + + for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++) + { + mve_pred16_t p = vctp16q((uint32_t)row_len_tmp); + const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p); + row_len_tmp -= 8; + + int16x8_t c0 = vldrbq_s16(ip_c0); + ip_c0 += 8; + c0 = vaddq_s16(c0, offset); + + int16x8_t c1 = vldrbq_s16(ip_c1); + ip_c1 += 8; + c1 = vaddq_s16(c1, offset); + + int16x8_t c2 = vldrbq_s16(ip_c2); + ip_c2 += 8; + c2 = vaddq_s16(c2, offset); + + int16x8_t c3 = vldrbq_s16(ip_c3); + ip_c3 += 8; + c3 = vaddq_s16(c3, offset); + + int16x8_t r0 = vldrbq_z_s16(ip_r0, p); + ip_r0 += 8; + + acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p); + acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p); + acc_2 = vmladavaq_p_s16(acc_2, r0, c2, p); + acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p); + } + + int32x4_t res = {acc_0, acc_1, acc_2, acc_3}; + if (bias) + { + res = vaddq_n_s32(res, bias[i_out_ch]); + } + res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]); + res = vaddq_n_s32(res, out_offset); + + res = vmaxq_s32(res, vdupq_n_s32(activation_min)); + res = vminq_s32(res, vdupq_n_s32(activation_max)); + + const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3}; + vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res); + } + out += 4 * output_ch; + } + else + { + for (int i_col_batch = (col_batches & ~0x3); i_col_batch < (col_batches & 0x3); i_col_batch++) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t row_len_tmp = row_len; + + const int8_t *ip_r0 = input_row + (i_out_ch * row_len); + const int8_t *ip_c0 = input_col + (i_col_batch * row_len); + int32_t acc_0 = 0; + const int32_t row_loop_cnt = (row_len + 7) / 8; + + for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++) + { + const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp); + const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p); + row_len_tmp -= 8; + + int16x8_t c0 = vldrbq_s16(ip_c0); + ip_c0 += 8; + c0 = vaddq_s16(c0, offset); + + int16x8_t r0 = vldrbq_z_s16(ip_r0, p); + ip_r0 += 8; + acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p); + } + + if (bias) + { + acc_0 += bias[i_out_ch]; + } + acc_0 = arm_nn_requantize(acc_0, output_mult[i_out_ch], output_shift[i_out_ch]); + acc_0 += out_offset; + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + out[i_out_ch] = (q7_t)acc_0; + } + out += output_ch; + } + } + return out; + +#else + (void)input_row; + (void)input_col; + (void)output_ch; + (void)col_batches; + (void)output_shift; + (void)output_mult; + (void)out_offset; + (void)col_offset; + (void)row_offset; + (void)activation_min; + (void)activation_max; + (void)row_len; + (void)bias; + (void)out; + return NULL; +#endif +} diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/CMakeLists.txt new file mode 100644 index 0000000..cccd996 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/CMakeLists.txt @@ -0,0 +1,21 @@ +# +# Copyright (c) 2019-2021 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8.c") +target_sources(cmsis-nn PRIVATE ${SRC} arm_fully_connected_s16.c) + diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c new file mode 100644 index 0000000..9eb02eb --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c @@ -0,0 +1,197 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_mat_q7_vec_q15.c + * Description: Mixed Q15-Q7 fully-connected layer function + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * + * <b>Buffer size:</b> + * + * vec_buffer size: 0 + * + * Q7_Q15 version of the fully connected layer + * + * Weights are in q7_t and Activations are in q15_t + * + */ + +arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q15_t *pOut, + q15_t *vec_buffer) +{ + (void)vec_buffer; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q7_t *pB = pM; + const q7_t *pB2; + q15_t *pO = pOut; + const q7_t *pBias = bias; + const q15_t *pA = pV; + + uint16_t rowCnt = num_of_rows >> 1; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + pB2 = pB + dim_vec; + + while (colCnt) + { + q31_t inV, inM11, inM12, inM21, inM22; + pB = read_and_pad(pB, &inM11, &inM12); + pB2 = read_and_pad(pB2, &inM21, &inM22); + + inV = arm_nn_read_q15x2_ia(&pA); + + sum = __SMLAD(inV, inM11, sum); + sum2 = __SMLAD(inV, inM21, sum2); + + inV = arm_nn_read_q15x2_ia(&pA); + + sum = __SMLAD(inV, inM12, sum); + sum2 = __SMLAD(inV, inM22, sum2); + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + q7_t inM2 = *pB2++; + + sum += inV * inM; + sum2 += inV * inM2; + colCnt--; + } /* while over colCnt */ + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16)); + + /*adjust the pointers and counters */ + pB += dim_vec; + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x1; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q31_t inV1, inV2, inM11, inM12; + + pB = read_and_pad(pB, &inM11, &inM12); + + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM11, sum); + + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM12, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + + rowCnt--; + } + +#else + int i, j; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + for (i = 0; i < num_of_rows; i++) + { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16); + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c new file mode 100644 index 0000000..a2da772 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c @@ -0,0 +1,417 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_mat_q7_vec_q15_opt.c + * Description: Mixed Q15-Q7 opt fully-connected layer function + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * + * <b>Buffer size:</b> + * + * vec_buffer size: 0 + * + * Q7_Q15 version of the fully connected layer + * + * Weights are in q7_t and Activations are in q15_t + * + * Limitation: x4 version requires weight reordering to work + * + * Here we use only one pointer to read 4 rows in the weight + * matrix. So if the original q7_t matrix looks like this: + * + * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | + * + * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | + * + * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | + * + * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | + * + * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | + * + * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | + * + * We operates on multiple-of-4 rows, so the first four rows becomes + * + * | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 | + * + * | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 | + * + * | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 | + * + * The column left over will be in-order. + * which is: + * | a17 | a27 | a37 | a47 | + * + * For the left-over rows, we do 1x1 computation, so the data remains + * as its original order. + * + * So the stored weight matrix looks like this: + * + * | a11 | a21 | a12 | a22 | a31 | a41 | + * + * | a32 | a42 | a13 | a23 | a14 | a24 | + * + * | a33 | a43 | a34 | a44 | a15 | a25 | + * + * | a16 | a26 | a35 | a45 | a36 | a46 | + * + * | a17 | a27 | a37 | a47 | a51 | a52 | + * + * | a53 | a54 | a55 | a56 | a57 | a61 | + * + * | a62 | a63 | a64 | a65 | a66 | a67 | + * + */ + +arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q15_t *pOut, + q15_t *vec_buffer) +{ + + (void)vec_buffer; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q7_t *pB = pM; + q15_t *pO = pOut; + const q7_t *pBias = bias; + const q15_t *pA = pV; + + uint16_t rowCnt = num_of_rows >> 2; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM11, inV, sum); + sum2 = __SMLAD(inM12, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM13, inV, sum3); + sum4 = __SMLAD(inM14, inV, sum4); + colCnt--; + } + +#else + + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = *__SIMD32(pA)++; + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM12, inV, sum); + sum2 = __SMLAD(inM11, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM14, inV, sum3); + sum4 = __SMLAD(inM13, inV, sum4); + colCnt--; + } + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + + /* + * register needed: + * loop counter: colCnt + * accumulators: sum, sum2, sum3, sum4 + * pointers: pB, pA + * weight data: inM11, inM12, inM13, inM14 + * activation data: inV + */ + +#ifndef ARM_MATH_BIG_ENDIAN + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #4\n" + "ldr.w r1, [%[pB]], #8\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r1, %[sum]\n" + "smlad %[sum2], r4, r0, %[sum2]\n" + "ldr.w r3, [%[pB], #-4]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r3, %[sum3]\n" + "smlad %[sum4], r4, r2, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); +#else + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #4\n" + "ldr.w r1, [%[pB]], #8\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r0, %[sum]\n" + "smlad %[sum2], r4, r1, %[sum2]\n" + "ldr.w r3, [%[pB], #-4]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r2, %[sum3]\n" + "smlad %[sum4], r4, r3, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); +#endif /* ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + q7_t inM2 = *pB++; + q7_t inM3 = *pB++; + q7_t inM4 = *pB++; + + sum += inV * inM; + sum2 += inV * inM2; + sum3 += inV * inM3; + sum4 += inV * inM4; + colCnt--; + } /* while over colCnt */ + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum3 >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum4 >> out_shift), 16)); + + /* adjust the pointers and counters */ + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q31_t inV1, inV2, inM11, inM12; + + pB = read_and_pad(pB, &inM11, &inM12); + + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM11, sum); + + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM12, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + + rowCnt--; + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + colCnt--; + } + + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q15_t)__SSAT((sum >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q15_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t)__SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c new file mode 100644 index 0000000..d8b6887 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15.c @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_q15.c + * Description: Q15 basic fully-connected layer function + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * + * @details + * + * <b>Buffer size:</b> + * + * vec_buffer size: 0 + * + */ + +arm_status arm_fully_connected_q15(const q15_t *pV, + const q15_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t *bias, + q15_t *pOut, + q15_t *vec_buffer) +{ + (void)vec_buffer; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q15_t *pB = pM; + const q15_t *pB2 = pB + dim_vec; + q15_t *pO = pOut; + const q15_t *pA; + const q15_t *pBias = bias; + uint16_t rowCnt = num_of_rows >> 1; + + /* this loop loops over different output */ + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + pB2 = pB + dim_vec; + + while (colCnt) + { + q31_t inV1, inM1, inM2; + inV1 = arm_nn_read_q15x2_ia(&pA); + inM1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV1, inM1, sum); + inM2 = arm_nn_read_q15x2_ia(&pB2); + sum2 = __SMLAD(inV1, inM2, sum2); + + inV1 = arm_nn_read_q15x2_ia(&pA); + inM1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV1, inM1, sum); + inM2 = arm_nn_read_q15x2_ia(&pB2); + sum2 = __SMLAD(inV1, inM2, sum2); + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q15_t inM = *pB++; + q15_t inM2 = *pB2++; + + sum += inV * inM; + sum2 += inV * inM2; + colCnt--; + } /* while over colCnt */ + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16)); + + /* adjust the pointers and counters */ + pB = pB + dim_vec; + rowCnt--; + } + + rowCnt = num_of_rows & 0x1; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q31_t inV1, inM1; + inV1 = arm_nn_read_q15x2_ia(&pA); + inM1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV1, inM1, sum); + + inV1 = arm_nn_read_q15x2_ia(&pA); + inM1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV1, inM1, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q15_t inM = *pB++; + + sum += inV * inM; + + colCnt--; + } + + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + + rowCnt--; + } + +#else + int i, j; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + for (i = 0; i < num_of_rows; i++) + { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16); + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c new file mode 100644 index 0000000..f6c9b16 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q15_opt.c @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_q15_opt.c + * Description: Q15 opt fully-connected layer function + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * + * @details + * + * <b>Buffer size:</b> + * + * vec_buffer size: 0 + * + * Here we use only one pointer to read 4 rows in the weight + * matrix. So if the original matrix looks like this: + * + * | a11 | a12 | a13 | + * + * | a21 | a22 | a23 | + * + * | a31 | a32 | a33 | + * + * | a41 | a42 | a43 | + * + * | a51 | a52 | a53 | + * + * | a61 | a62 | a63 | + * + * We operates on multiple-of-4 rows, so the first four rows becomes + * + * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | + * + * | a13 | a23 | a33 | a43 | + * + * Remaining rows are kept the same original order. + * + * So the stored weight matrix looks like this: + * + * + * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | + * + * | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 | + * + * | a62 | a63 | + */ + +arm_status arm_fully_connected_q15_opt(const q15_t *pV, + const q15_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q15_t *bias, + q15_t *pOut, + q15_t *vec_buffer) +{ + (void)vec_buffer; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q15_t *pB = pM; + q15_t *pO = pOut; + const q15_t *pBias = bias; + const q15_t *pA = pV; + + uint16_t rowCnt = num_of_rows >> 2; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + +#ifdef USE_INTRINSIC + + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inV, inM11, sum); + inM12 = arm_nn_read_q15x2_ia(&pB); + sum2 = __SMLAD(inV, inM12, sum2); + inM13 = arm_nn_read_q15x2_ia(&pB); + sum3 = __SMLAD(inV, inM13, sum3); + inM14 = arm_nn_read_q15x2_ia(&pB); + sum4 = __SMLAD(inV, inM14, sum4); + colCnt--; + } + +#else + + /* + * register needed: + * loop counter: colCnt + * accumulators: sum, sum2, sum3, sum4 + * pointers: pB, pA + * weight data: inM11, inM12, inM13, inM14 + * activation data: inV + */ + + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #4\n" + "ldr.w r0, [%[pB]], #16\n" + "smlad %[sum], r4, r0, %[sum]\n" + "ldr.w r1, [%[pB] , #-12]\n" + "smlad %[sum2], r4, r1, %[sum2]\n" + "ldr.w r2, [%[pB] , #-8]\n" + "smlad %[sum3], r4, r2, %[sum3]\n" + "ldr.w r3, [%[pB] , #-4]\n" + "smlad %[sum4], r4, r3, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); + +#endif /* USE_INTRINSIC */ + + colCnt = dim_vec & 0x1; + while (colCnt) + { + + q15_t inV = *pA++; + q15_t inM = *pB++; + q15_t inM2 = *pB++; + q15_t inM3 = *pB++; + q15_t inM4 = *pB++; + + sum += inV * inM; + sum2 += inV * inM2; + sum3 += inV * inM3; + sum4 += inV * inM4; + colCnt--; + } /* while over colCnt */ + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum2 >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum3 >> out_shift), 16)); + *pO++ = (q15_t)(__SSAT((sum4 >> out_shift), 16)); + + /* adjust the pointers and counters */ + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q31_t inV1, inV2, inM1, inM2; + + inM1 = arm_nn_read_q15x2_ia(&pB); + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM1, sum); + + inM2 = arm_nn_read_q15x2_ia(&pB); + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM2, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q15_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q15_t)(__SSAT((sum >> out_shift), 16)); + + rowCnt--; + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + uint16_t rowCnt = num_of_rows >> 2; + const q15_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q15_t *pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q15_t inB1 = *pB++; + q15_t inB2 = *pB++; + sum += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum2 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum3 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum4 += inA1 * inB1 + inA2 * inB2; + + colCnt--; + } + colCnt = dim_vec & 0x1; + while (colCnt) + { + q15_t inA = *pA++; + q15_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + colCnt--; + } + *pO++ = (q15_t)__SSAT((sum >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q15_t inA = *pA++; + q15_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t)__SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c new file mode 100644 index 0000000..d500efe --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_q7.c + * Description: Q7 basic fully-connected layer function + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Q7 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * + * <b>Buffer size:</b> + * + * vec_buffer size: dim_vec + * + * This basic function is designed to work with regular weight + * matrix without interleaving. + * + */ + +arm_status arm_fully_connected_q7(const q7_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut, + q15_t *vec_buffer) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q7_t *pB = pM; + const q7_t *pB2; + q7_t *pO = pOut; + const q7_t *pBias = bias; + const q15_t *pA; + uint16_t rowCnt = num_of_rows >> 1; + + /* expand the vector into the buffer */ + arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec); + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 2; + + pA = vec_buffer; + pB2 = pB + dim_vec; + + while (colCnt) + { + q31_t inV, inM11, inM12, inM21, inM22; + pB = read_and_pad_reordered(pB, &inM11, &inM12); + pB2 = read_and_pad_reordered(pB2, &inM21, &inM22); + + inV = arm_nn_read_q15x2_ia(&pA); + + sum = __SMLAD(inV, inM11, sum); + sum2 = __SMLAD(inV, inM21, sum2); + + inV = arm_nn_read_q15x2_ia(&pA); + + sum = __SMLAD(inV, inM12, sum); + sum2 = __SMLAD(inV, inM22, sum2); + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inV = *pA++; + q15_t inM = *pB++; + q15_t inM2 = *pB2++; + + sum += inV * inM; + sum2 += inV * inM2; + colCnt--; + } /* while over colCnt */ + *pO++ = (q7_t)(__SSAT((sum >> out_shift), 8)); + *pO++ = (q7_t)(__SSAT((sum2 >> out_shift), 8)); + + /* adjust the pointers and counters */ + pB += dim_vec; + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x1; + + while (rowCnt) + { + uint16_t colCnt = dim_vec >> 2; + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + pA = vec_buffer; + + while (colCnt) + { + q31_t inV1, inV2, inM11, inM12; + + pB = read_and_pad_reordered(pB, &inM11, &inM12); + + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM11, sum); + + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM12, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inV = *pA++; + q15_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q7_t)(__SSAT((sum >> out_shift), 8)); + + rowCnt--; + } + +#else + (void)vec_buffer; + int i, j; + + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + for (i = 0; i < num_of_rows; i++) + { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) + { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__SSAT((ip_out >> out_shift), 8); + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c new file mode 100644 index 0000000..2f3d653 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_q7_opt.c + * Description: Q7 basic fully-connected layer function + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * + * <b>Buffer size:</b> + * + * vec_buffer size: dim_vec + * + * This opt function is designed to work with interleaved weight + * matrix. The vector input is assumed in q7_t format, we call + * arm_q7_to_q15_no_shift_shuffle function to expand into + * q15_t format with certain weight re-ordering, refer to the function + * comments for more details. + * Here we use only one pointer to read 4 rows in the weight + * matrix. So if the original q7_t matrix looks like this: + * + * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | + * + * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | + * + * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | + * + * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | + * + * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | + * + * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | + * + * + * We operates on multiple-of-4 rows, so the first four rows becomes + * + * | a11 | a21 | a13 | a23 | a31 | a41 | a33 | a43 | + * + * | a12 | a22 | a14 | a24 | a32 | a42 | a34 | a44 | + * + * | a15 | a25 | a35 | a45 | a16 | a26 | a36 | a46 | + * + * So within the kernel, we first read the re-ordered vector in as: + * + * | b1 | b3 | and | b2 | b4 | + * + * the four q31_t weights will look like + * + * | a11 | a13 |, | a21 | a23 |, | a31 | a33 |, | a41 | a43 | + * + * | a12 | a14 |, | a22 | a24 |, | a32 | a34 |, | a42 | a44 | + * + * The column left over will be in-order. + * which is: + * + * | a17 | a27 | a37 | a47 | + * + * For the left-over rows, we do 1x1 computation, so the data remains + * as its original order. + * + * So the stored weight matrix looks like this: + * + * | a11 | a21 | a13 | a23 | a31 | a41 | + * + * | a33 | a43 | a12 | a22 | a14 | a24 | + * + * | a32 | a42 | a34 | a44 | a15 | a25 | + * + * | a35 | a45 | a16 | a26 | a36 | a46 | + * + * | a17 | a27 | a37 | a47 | a51 | a52 | + * + * | a53 | a54 | a55 | a56 | a57 | a61 | + * + * | a62 | a63 | a64 | a65 | a66 | a67 | + * + * + */ + +arm_status arm_fully_connected_q7_opt(const q7_t *pV, + const q7_t *pM, + const uint16_t dim_vec, + const uint16_t num_of_rows, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut, + q15_t *vec_buffer) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + const q7_t *pB = pM; + q7_t *pO = pOut; + const q7_t *pBias = bias; + const q15_t *pA; + uint16_t rowCnt = num_of_rows >> 2; + + arm_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec); + + while (rowCnt) + { + + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = vec_buffer; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM11, inV, sum); + sum2 = __SMLAD(inM12, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM13, inV, sum3); + sum4 = __SMLAD(inM14, inV, sum4); + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM11, inV, sum); + sum2 = __SMLAD(inM12, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM13, inV, sum3); + sum4 = __SMLAD(inM14, inV, sum4); + colCnt--; + } +#else + while (colCnt) + { + q31_t inM11, inM12, inM13, inM14; + q31_t inV; + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM12, inV, sum); + sum2 = __SMLAD(inM11, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM14, inV, sum3); + sum4 = __SMLAD(inM13, inV, sum4); + + inV = arm_nn_read_q15x2_ia(&pA); + inM11 = arm_nn_read_q7x4_ia(&pB); + inM12 = __SXTB16(__ROR(inM11, 8)); + inM11 = __SXTB16(inM11); + sum = __SMLAD(inM12, inV, sum); + sum2 = __SMLAD(inM11, inV, sum2); + inM13 = arm_nn_read_q7x4_ia(&pB); + inM14 = __SXTB16(__ROR(inM13, 8)); + inM13 = __SXTB16(inM13); + sum3 = __SMLAD(inM14, inV, sum3); + sum4 = __SMLAD(inM13, inV, sum4); + colCnt--; + } +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + + /* + * register needed: + * loop counter: colCnt + * accumulators: sum, sum2, sum3, sum4 + * pointers: pB, pA + * weight data: inM11, inM12, inM13, inM14 + * activation data: inV + */ + +#ifndef ARM_MATH_BIG_ENDIAN + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #8\n" + "ldr.w r1, [%[pB]], #16\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r1, %[sum]\n" + "smlad %[sum2], r4, r0, %[sum2]\n" + "ldr.w r3, [%[pB], #-12]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r3, %[sum3]\n" + "smlad %[sum4], r4, r2, %[sum4]\n" + "ldr.w r4, [%[pA], #-4]\n" + "ldr.w r1, [%[pB], #-8]\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r1, %[sum]\n" + "smlad %[sum2], r4, r0, %[sum2]\n" + "ldr.w r3, [%[pB], #-4]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r3, %[sum3]\n" + "smlad %[sum4], r4, r2, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); +#else + asm volatile("COL_LOOP_%=:\n" + "ldr.w r4, [%[pA]], #8\n" + "ldr.w r1, [%[pB]], #16\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r0, %[sum]\n" + "smlad %[sum2], r4, r1, %[sum2]\n" + "ldr.w r3, [%[pB], #-12]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r2, %[sum3]\n" + "smlad %[sum4], r4, r3, %[sum4]\n" + "ldr.w r4, [%[pA], #-4]\n" + "ldr.w r1, [%[pB], #-8]\n" + "mov.w r0, r1, ror #8\n" + "sxtb16 r0, r0\n" + "sxtb16 r1, r1\n" + "smlad %[sum], r4, r0, %[sum]\n" + "smlad %[sum2], r4, r1, %[sum2]\n" + "ldr.w r3, [%[pB], #-4]\n" + "mov.w r2, r3, ror #8\n" + "sxtb16 r2, r2\n" + "sxtb16 r3, r3\n" + "smlad %[sum3], r4, r2, %[sum3]\n" + "smlad %[sum4], r4, r3, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt) + : "r0", "r1", "r2", "r3", "r4"); +#endif /* ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + q7_t inM2 = *pB++; + q7_t inM3 = *pB++; + q7_t inM4 = *pB++; + + sum += inV * inM; + sum2 += inV * inM2; + sum3 += inV * inM3; + sum4 += inV * inM4; + colCnt--; + } /* while over colCnt */ + *pO++ = (q7_t)(__SSAT((sum >> out_shift), 8)); + *pO++ = (q7_t)(__SSAT((sum2 >> out_shift), 8)); + *pO++ = (q7_t)(__SSAT((sum3 >> out_shift), 8)); + *pO++ = (q7_t)(__SSAT((sum4 >> out_shift), 8)); + + /* adjust the pointers and counters */ + rowCnt--; + } + + /* left-over part of the rows */ + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 2; + + pA = vec_buffer; + + while (colCnt) + { + q31_t inV1, inV2, inM11, inM12; + + pB = read_and_pad_reordered(pB, &inM11, &inM12); + + inV1 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV1, inM11, sum); + + inV2 = arm_nn_read_q15x2_ia(&pA); + sum = __SMLAD(inV2, inM12, sum); + + colCnt--; + } + + /* left-over of the vector */ + colCnt = dim_vec & 0x3; + while (colCnt) + { + q15_t inV = *pA++; + q7_t inM = *pB++; + sum += inV * inM; + colCnt--; + } + + *pO++ = (q7_t)(__SSAT((sum >> out_shift), 8)); + + rowCnt--; + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + (void)vec_buffer; + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q7_t *pA; + q7_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) + { + q7_t inA1 = *pA++; + q7_t inA3 = *pA++; + q7_t inA2 = *pA++; + q7_t inA4 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum += inA3 * inB1 + inA4 * inB2; + sum2 += inA3 * inB3 + inA4 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA3 * inB1 + inA4 * inB2; + sum4 += inA3 * inB3 + inA4 * inB4; + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) + { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) + { + q7_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q7_t)__SSAT((ip_out >> out_shift), 8); + + rowCnt--; + } + +#endif /* ARM_MATH_DSP */ + + /* Return to ARM_MATH_SUCCESS */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of FC group + */ diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s16.c b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s16.c new file mode 100644 index 0000000..46df578 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s16.c @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_s16 + * Description: Fully connected function compatible with TF Lite. + * + * $Date: 3. August 2021 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/* + * S16 basic fully-connected and matrix multiplication layer function for TensorFlow Lite + * + * Refer header file for details. + * + */ +arm_status arm_fully_connected_s16(const cmsis_nn_context *ctx, + const cmsis_nn_fc_params *fc_params, + const cmsis_nn_per_tensor_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int64_t *bias, + const cmsis_nn_dims *output_dims, + q15_t *output) +{ + (void)bias_dims; + (void)ctx; + (void)fc_params->filter_offset; + + int32_t batch_cnt = input_dims->n; + + const q31_t reduced_multiplier = REDUCE_MULTIPLIER(quant_params->multiplier); + + while (batch_cnt) + { + arm_nn_vec_mat_mult_t_s16(input, + kernel, + bias, + output, + reduced_multiplier, + quant_params->shift, + filter_dims->n, /* col_dim or accum_depth */ + output_dims->c, /* row_dim or output_depth */ + fc_params->activation.min, + fc_params->activation.max); + input += filter_dims->n; + output += output_dims->c; + batch_cnt--; + } + + return (ARM_MATH_SUCCESS); +} + +int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims) +{ + (void)filter_dims; + return 0; +} + +/** + * @} end of FC group + */ diff --git a/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c new file mode 100644 index 0000000..9615701 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_fully_connected_s8 + * Description: Fully connected function compatible with TF Lite. + * + * $Date: 8 April 2022 + * $Revision: V.3.1.0 + * + * Target Processor: Cortex-M and Cortex-A cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup FC + * @{ + */ + +/* + * S8 basic fully-connected and matrix multiplication layer function for TensorFlow Lite + * + * Refer header file for details. + * + */ + +arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx, + const cmsis_nn_fc_params *fc_params, + const cmsis_nn_per_tensor_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + (void)bias_dims; + (void)ctx; + (void)fc_params->filter_offset; + + int32_t batch_cnt = input_dims->n; + + while (batch_cnt) + { + arm_nn_vec_mat_mult_t_s8(input, + kernel, + bias, + output, + fc_params->input_offset, + 0, + fc_params->output_offset, + quant_params->multiplier, + quant_params->shift, + filter_dims->n, /* col_dim or accum_depth */ + output_dims->c, /* row_dim or output_depth */ + fc_params->activation.min, + fc_params->activation.max, + 1L); + input += filter_dims->n; + output += output_dims->c; + batch_cnt--; + } + return (ARM_MATH_SUCCESS); +} + +int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) +{ + (void)filter_dims; + return 0; +} + +/** + * @} end of FC group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt new file mode 100644 index 0000000..0aa9f38 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt @@ -0,0 +1,26 @@ +# +# Copyright (c) 2019-2022 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8.c") +target_sources(cmsis-nn PRIVATE ${SRC} arm_q7_to_q15_with_offset.c + arm_nn_mat_mul_kernel_s16.c + arm_q7_to_q15_with_offset.c + arm_nn_mat_mul_kernel_s16.c + arm_nn_vec_mat_mult_t_s16.c + arm_q7_to_q15_no_shift.c) + diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c new file mode 100644 index 0000000..c3f666a --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_accumulate_q7_to_q15.c + * Description: Accumulate q7 vector into q15 one. + * + * $Date: 20 July 2021 + * $Revision: V.1.1.2 + * + * pSrc Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length) +{ + q15_t *pCnt = pDst; + const q7_t *pV = pSrc; + int32_t count = length; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + q31_t v1, v2, vo1, vo2; + count = length >> 2; + q31_t in; + + while (count > 0l) + { + q31_t value = arm_nn_read_q7x4_ia(&pV); + v1 = __SXTB16(__ROR((uint32_t)value, 8)); + v2 = __SXTB16(value); +#ifndef ARM_MATH_BIG_ENDIAN + vo2 = (q31_t)__PKHTB(v1, v2, 16); + vo1 = (q31_t)__PKHBT(v2, v1, 16); +#else + vo1 = (q31_t)__PKHTB(v1, v2, 16); + vo2 = (q31_t)__PKHBT(v2, v1, 16); +#endif + + in = arm_nn_read_q15x2(pCnt); + arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in)); + + in = arm_nn_read_q15x2(pCnt); + arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in)); + + count--; + } + count = length & 0x3; +#endif + while (count > 0l) + { + *pCnt++ += *pV++; + count--; + } +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c new file mode 100644 index 0000000..511e586 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_add_q7.c + * Description: Non saturating addition of elements of a q7 vector. + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nn_tables.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size) +{ + uint32_t block_count; + q31_t result = 0; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Loop unrolling: Compute 4 outputs at a time */ + block_count = block_size >> 2U; + + while (block_count > 0U) + { + const int32_t mult_q15x2 = (1UL << 16) | 1UL; + q31_t in_q7x4 = arm_nn_read_q7x4_ia(&input); + q31_t temp_q15x2 = __SXTAB16(__SXTB16(in_q7x4), __ROR((uint32_t)in_q7x4, 8)); + + result = __SMLAD(temp_q15x2, mult_q15x2, result); + + /* Decrement loop counter */ + block_count--; + } + + /* Loop unrolling: Compute remaining outputs */ + block_count = block_size & 0x3; +#else + block_count = block_size; +#endif + while (block_count > 0U) + { + /* Add and store result in destination buffer. */ + result += *input++; + + /* Decrement loop counter */ + block_count--; + } + + *output = result; +} + +/** + * @} end of NNBasicMath group + */
\ No newline at end of file diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c new file mode 100644 index 0000000..b633ef4 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_depthwise_conv_nt_t_padded_s8.c + * Description: Depthwise convolution with padded matrices. + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M processors with MVE extension + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * Depthwise convolution of transposed rhs matrix with 4 lhs matrices. One or more of the rhs matrices are padded. + * Dimensions are the same for lhs and rhs. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs, + const q7_t *rhs, + const int32_t input_offset, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + int32_t loop_count = (num_ch + 3) / 4; + const int32_t *bias = output_bias; + uint32_t num_ch_to_process = num_ch; + + for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; + num_ch_to_process -= 4, out += 4, offset += 4, i_loop_cnt++) + { + int32x4_t out_0 = vldrwq_s32(bias); + int32x4_t out_1 = out_0; + int32x4_t out_2 = out_0; + int32x4_t out_3 = out_0; + bias += 4; + + const int8_t *rhs_0 = rhs + offset; + const int8_t *lhs_0 = lhs + offset; + const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset; + const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset; + const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset; + + for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++) + { + const int32x4_t ker_0 = vldrbq_s32(rhs_0); + + int32x4_t ip_0 = vldrbq_s32(lhs_0); + ip_0 = vaddq_n_s32(ip_0, input_offset); + out_0 += vmulq_s32(ip_0, ker_0); + + int32x4_t ip_1 = vldrbq_s32(lhs_1); + ip_1 = vaddq_n_s32(ip_1, input_offset); + out_1 += vmulq_s32(ip_1, ker_0); + + int32x4_t ip_2 = vldrbq_s32(lhs_2); + ip_2 = vaddq_n_s32(ip_2, input_offset); + out_2 += vmulq_s32(ip_2, ker_0); + + int32x4_t ip_3 = vldrbq_s32(lhs_3); + ip_3 = vaddq_n_s32(ip_3, input_offset); + + out_3 += vmulq_s32(ip_3, ker_0); + + lhs_0 += num_ch; + lhs_1 += num_ch; + lhs_2 += num_ch; + lhs_3 += num_ch; + + rhs_0 += num_ch; + } + + const int32x4_t mult = vldrwq_s32(out_mult); + const int32x4_t shift = vldrwq_s32(out_shift); + out_mult += 4; + out_shift += 4; + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_0 = vaddq_n_s32(out_0, out_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max)); + mve_pred16_t p = vctp32q(num_ch_to_process); + vstrbq_p_s32(out, out_0, p); + + out_1 = arm_requantize_mve_32x4(out_1, mult, shift); + out_1 = vaddq_n_s32(out_1, out_offset); + out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min)); + out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + num_ch, out_1, p); + + out_2 = arm_requantize_mve_32x4(out_2, mult, shift); + out_2 = vaddq_n_s32(out_2, out_offset); + out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min)); + out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 2 * num_ch, out_2, p); + + out_3 = arm_requantize_mve_32x4(out_3, mult, shift); + out_3 = vaddq_n_s32(out_3, out_offset); + out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min)); + out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 3 * num_ch, out_3, p); + } + + const int tail_ch = num_ch & 0x3; + if (tail_ch != 0) + { + out -= (4 - tail_ch); + } + return out + (3 * num_ch); + +#else + (void)lhs; + (void)rhs; + (void)input_offset; + (void)num_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)row_x_col; + (void)output_bias; + (void)out; + return NULL; +#endif +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c new file mode 100644 index 0000000..dda12fd --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_depthwise_conv_nt_t_s8.c + * Description: Depthwise convolution on matrices with no padding. + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M processors with MVE extension. + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding. Dimensions are the same for lhs and rhs. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs, + const q7_t *rhs, + const int32_t input_offset, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t row_x_col, + const int32_t *const output_bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + const int32_t *bias = output_bias; + int32_t loop_count = (num_ch + 3) / 4; + uint32_t num_ch_to_process = num_ch; + + for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; + num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++) + { + int32x4_t out_0 = vldrwq_s32(bias); + int32x4_t out_1 = out_0; + int32x4_t out_2 = out_0; + int32x4_t out_3 = out_0; + bias += 4; + + const int8_t *rhs_0 = rhs + offset; + const int8_t *lhs_0 = lhs + offset; + const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset; + const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset; + const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset; + int32x4_t ker_sum = vdupq_n_s32(0); + + for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++) + { + const int32x4_t ker_0 = vldrbq_s32(rhs_0); + ker_sum = vaddq_s32(ker_sum, ker_0); + + int32x4_t ip_0 = vldrbq_s32(lhs_0); + out_0 += vmulq_s32(ip_0, ker_0); + + int32x4_t ip_1 = vldrbq_s32(lhs_1); + out_1 += vmulq_s32(ip_1, ker_0); + + int32x4_t ip_2 = vldrbq_s32(lhs_2); + out_2 += vmulq_s32(ip_2, ker_0); + + int32x4_t ip_3 = vldrbq_s32(lhs_3); + out_3 += vmulq_s32(ip_3, ker_0); + + lhs_0 += num_ch; + lhs_1 += num_ch; + lhs_2 += num_ch; + lhs_3 += num_ch; + + rhs_0 += num_ch; + } + + ker_sum = vmulq_n_s32(ker_sum, input_offset); + out_0 = ker_sum + out_0; + out_1 = ker_sum + out_1; + out_2 = ker_sum + out_2; + out_3 = ker_sum + out_3; + + const int32x4_t mult = vldrwq_s32(out_mult); + const int32x4_t shift = vldrwq_s32(out_shift); + out_mult += 4; + out_shift += 4; + mve_pred16_t p = vctp32q(num_ch_to_process); + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_0 = vaddq_n_s32(out_0, out_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out, out_0, p); + + out_1 = arm_requantize_mve_32x4(out_1, mult, shift); + out_1 = vaddq_n_s32(out_1, out_offset); + out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min)); + out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + num_ch, out_1, p); + + out_2 = arm_requantize_mve_32x4(out_2, mult, shift); + out_2 = vaddq_n_s32(out_2, out_offset); + out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min)); + out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 2 * num_ch, out_2, p); + + out_3 = arm_requantize_mve_32x4(out_3, mult, shift); + out_3 = vaddq_n_s32(out_3, out_offset); + out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min)); + out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out + 3 * num_ch, out_3, p); + } + + const int tail_ch = num_ch & 0x3; + if (tail_ch != 0) + { + out -= (4 - tail_ch); + } + + return out + (3 * num_ch); +#else + (void)lhs; + (void)rhs; + (void)input_offset; + (void)num_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)row_x_col; + (void)output_bias; + (void)out; + return NULL; +#endif +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c new file mode 100644 index 0000000..8b1bf6e --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mul_core_1x_s8.c + * Description: General Matrix-multiplication function + * + * $Date: 19. April 2022 + * $Revision: V.1.0.3 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 matrix multiplication to process 1 row + * + * Refer header file for details. + * + */ + +arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements, + const int8_t *row_base, + const int8_t *col_base, + int32_t *const sum_col, + int32_t *const output) +{ + int32_t acc_n0 = 0; + int32_t sum_tmp = 0; + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + + __ASM volatile(" vldrb.8 q0, [%[col]], #16 \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], #16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " letp lr, 2b \n" + "1: \n" + : [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(row_base), [out0] "+Te"(acc_n0) + : [cnt] "r"(row_elements) + : "q0", "q1", "memory", "r14"); +#else + for (int i = 0; i < row_elements; i++) + { + sum_tmp += col_base[i]; + acc_n0 += row_base[i] * col_base[i]; + } +#endif + + *sum_col = sum_tmp; + *output = acc_n0; + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c new file mode 100644 index 0000000..ff427ad --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mul_core_4x_s8.c + * Description: General matrix multiplication function for MVE extension + * + * $Date: 19. April 2022 + * $Revision: V.3.0.1 + * + * Target Processor: Cortex-M processors + * -------------------------------------------------------------------- */ +#include "arm_nn_types.h" +#include "arm_nnsupportfunctions.h" +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 matrix multiplication to process 4 rows and one column + * + * Refer header file for details. + * + */ + +int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, + const int32_t offset, + const int8_t *row_base, + const int8_t *col_base_ref, + const int32_t out_ch, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const int32_t *bias, + int8_t *output) +{ + +#if defined(ARM_MATH_MVEI) + for (int i = 0; i < out_ch; i++) + { + int32_t acc_n0 = 0; + int32_t acc_n1 = 0; + int32_t acc_n2 = 0; + int32_t acc_n3 = 0; + + const int8_t *ip_row_0 = row_base; + const int8_t *ip_row_1 = row_base + offset; + const int8_t *ip_row_2 = row_base + (2 * offset); + const int8_t *ip_row_3 = row_base + (3 * offset); + const int8_t *col_base = col_base_ref + i * row_elements; + int32_t sum_tmp = 0; + + __ASM volatile(" vldrb.8 q0, [%[col]], #16 \n" + " wlstp.8 lr, %[cnt], 1f \n" + "2: \n" + " vaddva.s8 %[sum], q0 \n" + " vldrb.8 q1, [%[row0]], #16 \n" + " vmladava.s8 %[out0], q0, q1 \n" + " vldrb.8 q2, [%[row1]], #16 \n" + " vmladava.s8 %[out1], q0, q2 \n" + " vldrb.8 q3, [%[row2]], #16 \n" + " vmladava.s8 %[out2], q0, q3 \n" + " vldrb.8 q4, [%[row3]], #16 \n" + " vmladava.s8 %[out3], q0, q4 \n" + " vldrb.8 q0, [%[col]], #16 \n" + " letp lr, 2b \n" + "1: \n" + : [col] "+r"(col_base), + [sum] "+Te"(sum_tmp), + [row0] "+r"(ip_row_0), + [row1] "+r"(ip_row_1), + [row2] "+r"(ip_row_2), + [row3] "+r"(ip_row_3), + [out0] "+Te"(acc_n0), + [out1] "+Te"(acc_n1), + [out2] "+Te"(acc_n2), + [out3] "+Te"(acc_n3) + : [cnt] "r"(row_elements) + : "q0", "q1", "q2", "q3", "q4", "memory", "r14"); + + int32x4_t res = {acc_n0, acc_n1, acc_n2, acc_n3}; + sum_tmp *= conv_params->input_offset; + if (bias) + { + sum_tmp += bias[i]; + } + res = vaddq_n_s32(res, sum_tmp); + + res = arm_requantize_mve(res, quant_params->multiplier[i], quant_params->shift[i]); + res = vaddq_n_s32(res, conv_params->output_offset); + + res = vmaxq_s32(res, vdupq_n_s32(conv_params->activation.min)); + res = vminq_s32(res, vdupq_n_s32(conv_params->activation.max)); + + const uint32x4_t scatter_offset = {0, (uint32_t)out_ch, (uint32_t)out_ch * 2, (uint32_t)out_ch * 3}; + vstrbq_scatter_offset_s32(output, scatter_offset, res); + output++; + } + + return output + (3 * out_ch); +#else + (void)row_elements; + (void)offset; + (void)row_base; + (void)col_base_ref; + (void)out_ch; + (void)conv_params; + (void)quant_params; + (void)bias; + (void)output; + return NULL; +#endif +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c new file mode 100644 index 0000000..41d0bc9 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c @@ -0,0 +1,250 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_s16.c + * Description: Matrix-multiplication function for convolution + * + * $Date: 12 August 2021 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/* + * Matrix-multiplication function for convolution with per-channel requantization. + * + * Refer header file for details. + * + */ + +q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a, + const q15_t *input_b, + const int32_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int16_t activation_min, + const int16_t activation_max, + const int32_t num_col_a, + const int64_t *const output_bias, + q15_t *out_0) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* set up the second output pointers */ + q15_t *out_1 = out_0 + output_ch; + const int64_t *bias = output_bias; + uint16_t row_count = output_ch / 2; + const q7_t *ip_a0 = input_a; + + /* this loop over rows in A */ + while (row_count) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* align the second pointer for A */ + const q7_t *ip_a1 = ip_a0 + num_col_a; + + /* Init accumulator for channel N and N + 1 */ + q31_t ch_0_out_0 = 0; + q31_t ch_0_out_1 = 0; + q31_t ch_1_out_0 = 0; + q31_t ch_1_out_1 = 0; + + uint16_t col_count = num_col_a / 4; + /* accumulate over the vector */ + while (col_count) + { + q31_t a01, a02, a11, a12; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad(ip_a0, &a01, &a02); + ip_a1 = read_and_pad(ip_a1, &a11, &a12); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + + col_count--; + } /* while over col_count */ + col_count = num_col_a & 0x3; + while (col_count) + { + q7_t a0 = *ip_a0++; + q15_t b0 = *ip_b0++; + q7_t a1 = *ip_a1++; + q15_t b1 = *ip_b1++; + + ch_0_out_0 += a0 * b0; + ch_0_out_1 += a0 * b1; + ch_1_out_0 += a1 * b0; + ch_1_out_1 += a1 * b1; + col_count--; + } /* while over col_count */ + if (bias) + { + q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + q63_t acc_64 = ch_0_out_0 + *bias; + ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); + acc_64 = ch_0_out_1 + *bias++; + ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); + out_mult++; + } + else + { + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + out_mult++; + } + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q15_t)ch_0_out_0; + + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q15_t)ch_0_out_1; + out_shift++; + + if (bias) + { + q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + q63_t acc_64 = ch_1_out_0 + *bias; + ch_1_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); + acc_64 = ch_1_out_1 + *bias++; + ch_1_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); + out_mult++; + } + else + { + ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift); + ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); + out_mult++; + } + ch_1_out_0 = MAX(ch_1_out_0, activation_min); + ch_1_out_0 = MIN(ch_1_out_0, activation_max); + *out_0++ = (q15_t)ch_1_out_0; + + ch_1_out_1 = MAX(ch_1_out_1, activation_min); + ch_1_out_1 = MIN(ch_1_out_1, activation_max); + *out_1++ = (q15_t)ch_1_out_1; + out_shift++; + + /* skip row */ + ip_a0 += num_col_a; + row_count--; + } + + /* compute the last odd numbered row if any */ + if (output_ch & 0x1) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + q31_t ch_0_out_0 = 0; + q31_t ch_0_out_1 = 0; + + uint16_t col_count = num_col_a >> 2; + while (col_count) + { + q31_t a01, a02; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad(ip_a0, &a01, &a02); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + + col_count--; + } + col_count = num_col_a & 0x3; + while (col_count) + { + q7_t a0 = *ip_a0++; + q15_t b0 = *ip_b0++; + q15_t b1 = *ip_b1++; + + ch_0_out_0 += a0 * b0; + ch_0_out_1 += a0 * b1; + col_count--; + } + if (bias) + { + q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult); + q63_t acc_64 = ch_0_out_0 + *bias; + ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); + acc_64 = ch_0_out_1 + *bias++; + ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift); + } + else + { + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + } + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q15_t)ch_0_out_0; + + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q15_t)ch_0_out_1; + out_mult++; + out_shift++; + } + + out_0 += output_ch; + + /* return the new output pointer with offset */ + return out_0; +#else + (void)input_a; + (void)input_b; + (void)output_ch; + (void)out_shift; + (void)out_mult; + (void)activation_min; + (void)activation_max; + (void)num_col_a; + (void)output_bias; + (void)out_0; + /* To be completed */ + return NULL; +#endif +} diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c new file mode 100644 index 0000000..d0420c2 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c @@ -0,0 +1,582 @@ +/* + * Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_s8_nt_t_s8 + * Description: Matrix multiplication support function with the right-hand-side (rhs) matrix transposed + * + * $Date: 09. October 2020 + * $Revision: V.1.0.3 + * + * Target Processor: Cortex-M + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 matrix multiplication with the right-hand-side matrix transposed + * + * Refer header file for details. + * + */ +arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs, + const q7_t *rhs, + const q31_t *bias, + q7_t *dst, + const int32_t *dst_multipliers, + const int32_t *dst_shifts, + const int32_t lhs_rows, + const int32_t rhs_rows, + const int32_t rhs_cols, + const int32_t lhs_offset, + const int32_t dst_offset, + const int32_t activation_min, + const int32_t activation_max) +{ +#if defined(ARM_MATH_DSP) + const int32_t off0 = rhs_cols - 4; + + for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2) + { + const q7_t *lhs_ptr = &lhs[0]; + q7_t *dst_ptr = &dst[0]; + + q31_t lhs_offset_contribution0 = 0; + q31_t lhs_offset_contribution1 = 0; + + for (int32_t x = 0; x < rhs_cols; ++x) + { + lhs_offset_contribution0 += rhs[x]; + lhs_offset_contribution1 += rhs[x + rhs_cols]; + } + + lhs_offset_contribution0 *= lhs_offset; + lhs_offset_contribution1 *= lhs_offset; + if (bias) + { + lhs_offset_contribution0 += bias[rhs_rows_idx]; + lhs_offset_contribution1 += bias[rhs_rows_idx + 1]; + } + + int32_t lhs_rows_idx = lhs_rows >> 1; + + while (lhs_rows_idx) + { + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = lhs_offset_contribution0; + q31_t res01 = lhs_offset_contribution1; + q31_t res10 = lhs_offset_contribution0; + q31_t res11 = lhs_offset_contribution1; + + int32_t rhs_cols_idx = 0; + + q31_t val0, val1, val2, val3, val4, val5; + + for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) + { + val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val2 = __SXTB16(val1); + val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val1 = __SXTB16_RORn(val1, 8); + val0 = __SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val3, val2, res00); + val5 = __SXTB16(val4); + res00 = __SMLAD(val0, val1, res00); + val4 = __SXTB16_RORn(val4, 8); + res01 = __SMLAD(val3, val5, res01); + res01 = __SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); + val3 = __SXTB16(val0); + val0 = __SXTB16_RORn(val0, 8); + res10 = __SMLAD(val3, val2, res10); + res11 = __SMLAD(val3, val5, res11); + res10 = __SMLAD(val0, val1, res10); + val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + res11 = __SMLAD(val0, val4, res11); + + val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = __SXTB16(val1); + val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val1 = __SXTB16_RORn(val1, 8); + val0 = __SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val3, val2, res00); + val5 = __SXTB16(val4); + res00 = __SMLAD(val0, val1, res00); + val4 = __SXTB16_RORn(val4, 8); + res01 = __SMLAD(val3, val5, res01); + res01 = __SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); + val3 = __SXTB16(val0); + val0 = __SXTB16_RORn(val0, 8); + res10 = __SMLAD(val3, val2, res10); + res11 = __SMLAD(val3, val5, res11); + res10 = __SMLAD(val0, val1, res10); + val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + res11 = __SMLAD(val0, val4, res11); + + val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = __SXTB16(val1); + val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val1 = __SXTB16_RORn(val1, 8); + val0 = __SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val3, val2, res00); + val5 = __SXTB16(val4); + res00 = __SMLAD(val0, val1, res00); + val4 = __SXTB16_RORn(val4, 8); + res01 = __SMLAD(val3, val5, res01); + res01 = __SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); + val3 = __SXTB16(val0); + val0 = __SXTB16_RORn(val0, 8); + res10 = __SMLAD(val3, val2, res10); + res11 = __SMLAD(val3, val5, res11); + res10 = __SMLAD(val0, val1, res10); + val1 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + res11 = __SMLAD(val0, val4, res11); + + val4 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = __SXTB16(val1); + val0 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val1 = __SXTB16_RORn(val1, 8); + val0 = __SXTB16_RORn(val0, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val3, val2, res00); + val5 = __SXTB16(val4); + res00 = __SMLAD(val0, val1, res00); + val4 = __SXTB16_RORn(val4, 8); + res01 = __SMLAD(val3, val5, res01); + res01 = __SMLAD(val0, val4, res01); + + // 4 x MAC res10, res11 + val0 = arm_nn_read_q7x4((const q7_t *)&lhs_ptr[off0]); + val3 = __SXTB16(val0); + val0 = __SXTB16_RORn(val0, 8); + res10 = __SMLAD(val3, val2, res10); + res11 = __SMLAD(val3, val5, res11); + res10 = __SMLAD(val0, val1, res10); + res11 = __SMLAD(val0, val4, res11); + } + + for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q7_t rhs_value0 = rhs_ptr[0]; + q7_t rhs_value1 = rhs_ptr[rhs_cols]; + q7_t lhs_value = lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + + lhs_value = lhs_ptr[rhs_cols]; + res10 += lhs_value * rhs_value0; + res11 += lhs_value * rhs_value1; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + res10 = arm_nn_requantize(res10, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res11 = arm_nn_requantize(res11, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + res10 += dst_offset; + res11 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res10 = MAX(res10, activation_min); + res10 = MIN(res10, activation_max); + res11 = MAX(res11, activation_min); + res11 = MIN(res11, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr[1] = (q7_t)res01; + dst_ptr += rhs_rows; + dst_ptr[0] = (q7_t)res10; + dst_ptr[1] = (q7_t)res11; + dst_ptr += rhs_rows; + + lhs_ptr += rhs_cols; + + lhs_rows_idx--; + } + + // Left-over rows + if (lhs_rows % 2) + { + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = lhs_offset_contribution0; + q31_t res01 = lhs_offset_contribution1; + + int32_t rhs_cols_idx = 0; + + q31_t val0, val1, val2, val3, val4, val5; + for (; rhs_cols_idx <= (rhs_cols - 16); rhs_cols_idx += 16) + { + val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val5 = __SXTB16(val2); + val4 = __SXTB16(val1); + val0 = __SXTB16_RORn(val0, 8); + val2 = __SXTB16_RORn(val2, 8); + val1 = __SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val5, val3, res00); + res00 = __SMLAD(val2, val0, res00); + res01 = __SMLAD(val5, val4, res01); + res01 = __SMLAD(val2, val1, res01); + + val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val5 = __SXTB16(val2); + val4 = __SXTB16(val1); + val0 = __SXTB16_RORn(val0, 8); + val2 = __SXTB16_RORn(val2, 8); + val1 = __SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val5, val3, res00); + res00 = __SMLAD(val2, val0, res00); + res01 = __SMLAD(val5, val4, res01); + res01 = __SMLAD(val2, val1, res01); + + val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val5 = __SXTB16(val2); + val4 = __SXTB16(val1); + val0 = __SXTB16_RORn(val0, 8); + val2 = __SXTB16_RORn(val2, 8); + val1 = __SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val5, val3, res00); + res00 = __SMLAD(val2, val0, res00); + res01 = __SMLAD(val5, val4, res01); + res01 = __SMLAD(val2, val1, res01); + + val0 = arm_nn_read_q7x4_ia((const q7_t **)&rhs_ptr); + val1 = arm_nn_read_q7x4((const q7_t *)&rhs_ptr[off0]); + val2 = arm_nn_read_q7x4_ia((const q7_t **)&lhs_ptr); + val3 = __SXTB16(val0); + val5 = __SXTB16(val2); + val4 = __SXTB16(val1); + val0 = __SXTB16_RORn(val0, 8); + val2 = __SXTB16_RORn(val2, 8); + val1 = __SXTB16_RORn(val1, 8); + + // 4 x MAC res00, res01 + res00 = __SMLAD(val5, val3, res00); + res00 = __SMLAD(val2, val0, res00); + res01 = __SMLAD(val5, val4, res01); + res01 = __SMLAD(val2, val1, res01); + } + + // Left-over accumulations + for (; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q7_t rhs_value0 = rhs_ptr[0]; + q7_t rhs_value1 = rhs_ptr[rhs_cols]; + q7_t lhs_value = lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr[1] = (q7_t)res01; + } + + rhs += 2 * rhs_cols; + dst += 2; + } + + if (rhs_rows % 2) + { + const q7_t *lhs_ptr = &lhs[0]; + q7_t *dst_ptr = &dst[0]; + + for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx) + { + const q7_t *rhs_ptr = &rhs[0]; + q31_t res00 = 0; + if (bias) + { + res00 = bias[rhs_rows - 1]; + } + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q31_t rhs_value = rhs_ptr[0]; + q31_t lhs_value = lhs_ptr[0] + lhs_offset; + + res00 += lhs_value * rhs_value; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]); + + // Add offset + res00 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr += rhs_rows; + } + } +#else + for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2) + { + const q7_t *lhs_ptr = &lhs[0]; + q7_t *dst_ptr = &dst[0]; + + q31_t lhs_offset_contribution0 = 0; + q31_t lhs_offset_contribution1 = 0; + + for (int32_t x = 0; x < rhs_cols; ++x) + { + lhs_offset_contribution0 += rhs[x]; + lhs_offset_contribution1 += rhs[x + rhs_cols]; + } + + lhs_offset_contribution0 *= lhs_offset; + lhs_offset_contribution1 *= lhs_offset; + if (bias) + { + lhs_offset_contribution0 += bias[rhs_rows_idx]; + lhs_offset_contribution1 += bias[rhs_rows_idx + 1]; + } + + int32_t lhs_rows_idx = lhs_rows >> 1; + + while (lhs_rows_idx) + { + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = lhs_offset_contribution0; + q31_t res01 = lhs_offset_contribution1; + q31_t res10 = lhs_offset_contribution0; + q31_t res11 = lhs_offset_contribution1; + + for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) + { + q7_t rhs_value0 = rhs_ptr[0]; + q7_t rhs_value1 = rhs_ptr[rhs_cols]; + q7_t lhs_value = lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + + lhs_value = lhs_ptr[rhs_cols]; + res10 += lhs_value * rhs_value0; + res11 += lhs_value * rhs_value1; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + res10 = arm_nn_requantize(res10, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res11 = arm_nn_requantize(res11, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + res10 += dst_offset; + res11 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res10 = MAX(res10, activation_min); + res10 = MIN(res10, activation_max); + res11 = MAX(res11, activation_min); + res11 = MIN(res11, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr[1] = (q7_t)res01; + dst_ptr += rhs_rows; + dst_ptr[0] = (q7_t)res10; + dst_ptr[1] = (q7_t)res11; + dst_ptr += rhs_rows; + + lhs_ptr += rhs_cols; + + lhs_rows_idx--; + } + + // Left-over rows + if (lhs_rows % 2) + { + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = lhs_offset_contribution0; + q31_t res01 = lhs_offset_contribution1; + + for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) + { + q7_t rhs_value0 = rhs_ptr[0]; + q7_t rhs_value1 = rhs_ptr[rhs_cols]; + q7_t lhs_value = lhs_ptr[0]; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows_idx], dst_shifts[rhs_rows_idx]); + res01 = arm_nn_requantize(res01, dst_multipliers[rhs_rows_idx + 1], dst_shifts[rhs_rows_idx + 1]); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr[1] = (q7_t)res01; + } + + rhs += 2 * rhs_cols; + dst += 2; + } + + if (rhs_rows % 2) + { + const q7_t *lhs_ptr = &lhs[0]; + q7_t *dst_ptr = &dst[0]; + + for (int32_t lhs_rows_idx = 0; lhs_rows_idx < lhs_rows; ++lhs_rows_idx) + { + const q7_t *rhs_ptr = &rhs[0]; + q31_t res00 = 0; + if (bias) + { + res00 = bias[rhs_rows - 1]; + } + + for (int32_t rhs_cols_idx = rhs_cols; rhs_cols_idx != 0; rhs_cols_idx--) + { + q31_t rhs_value = rhs_ptr[0]; + q31_t lhs_value = lhs_ptr[0] + lhs_offset; + + res00 += lhs_value * rhs_value; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multipliers[rhs_rows - 1], dst_shifts[rhs_rows - 1]); + + // Add offset + res00 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + dst_ptr[0] = (q7_t)res00; + dst_ptr += rhs_rows; + } + } +#endif + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c new file mode 100644 index 0000000..d6a45ef --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mult_q15.c + * Description: Q15 vector multiplication with variable output shifts + * + * $Date: 20. July 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/** + * @brief Q7 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * + * <b>Scaling and Overflow Behavior:</b> + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated. + */ + +void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize) +{ + uint32_t blkCnt = blockSize; /* loop counters */ + + while (blkCnt > 0U) + { + /* C = A * B */ + /* Multiply the inputs and store the result in the destination buffer */ + *pDst++ = (q15_t)__SSAT(((q31_t)((q31_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 16); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c new file mode 100644 index 0000000..fdced4c --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mult_q7.c + * Description: Q7 vector multiplication with variable output shifts + * + * $Date: 20. July 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/** + * @brief Q7 vector multiplication with variable output shifts + * @param[in] *pSrcA pointer to the first input vector + * @param[in] *pSrcB pointer to the second input vector + * @param[out] *pDst pointer to the output vector + * @param[in] out_shift amount of right-shift for output + * @param[in] blockSize number of samples in each vector + * + * <b>Scaling and Overflow Behavior:</b> + * \par + * The function uses saturating arithmetic. + * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated. + */ + +void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize) +{ + uint32_t blkCnt = blockSize; /* loop counters */ + + while (blkCnt > 0U) + { + /* C = A * B */ + /* Multiply the inputs and store the result in the destination buffer */ + *pDst++ = (q7_t)__SSAT(((q15_t)((q15_t)(*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8); + + /* Decrement the blockSize loop counter */ + blkCnt--; + } +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c new file mode 100644 index 0000000..5956d3a --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2020-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_vec_mat_mult_t_s16 + * Description: s16 vector by matrix (transposed) multiplication + * + * $Date: 04. January 2022 + * $Revision: V.1.2.0 + * + * Target Processor: Cortex-M + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s16 vector(lhs) by matrix (transposed) multiplication + * + * Refer header file for details. + * + */ +arm_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs, + const q7_t *rhs, + const q63_t *bias, + q15_t *dst, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max) +{ +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + const int32_t row_loop_cnt = rhs_rows / 2; + + int32_t rhs_cols_fast = rhs_cols; + + if (rhs_cols > 512) + { + rhs_cols_fast = 512; + } + + for (int32_t i = 0; i < row_loop_cnt; i++) + { + q63_t acc_64_0 = 0; + q63_t acc_64_1 = 0; + int32_t acc_0 = 0; + int32_t acc_1 = 0; + + const int32_t col_loop_cnt = rhs_cols_fast / 4; + + const int16_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + rhs += 2 * rhs_cols; + + for (int j = col_loop_cnt; j != 0; j--) + { + int32_t ker_0, ker_1, vec_part_0, vec_part_1; + vec_part_0 = arm_nn_read_q15x2_ia(&lhs_vec); + vec_part_1 = arm_nn_read_q15x2_ia(&lhs_vec); + + rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1); + + acc_0 = __SMLAD(ker_0, vec_part_0, acc_0); + acc_0 = __SMLAD(ker_1, vec_part_1, acc_0); + + rhs_1 = read_and_pad(rhs_1, &ker_0, &ker_1); + + acc_1 = __SMLAD(ker_0, vec_part_0, acc_1); + acc_1 = __SMLAD(ker_1, vec_part_1, acc_1); + } + + acc_64_0 += acc_0; + acc_64_1 += acc_1; + + for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = (*lhs_vec); + lhs_vec++; + acc_64_0 += lhs_temp * (*rhs_0); + rhs_0++; + acc_64_1 += lhs_temp * (*rhs_1); + rhs_1++; + } + + if (bias) + { + acc_64_0 += *bias++; + acc_64_1 += *bias++; + } + q31_t tmp; + tmp = arm_nn_requantize_s64(acc_64_0, dst_multiplier, dst_shift); + tmp = MAX(tmp, activation_min); + tmp = MIN(tmp, activation_max); + *dst++ = (q15_t)tmp; + + tmp = arm_nn_requantize_s64(acc_64_1, dst_multiplier, dst_shift); + tmp = MAX(tmp, activation_min); + tmp = MIN(tmp, activation_max); + *dst++ = (q15_t)tmp; + } + + if (rhs_rows & 0x1) + { + q63_t acc_64_0 = 0; + int32_t acc_0 = 0; + const int32_t col_loop_cnt = rhs_cols_fast / 4; + + const int16_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + + for (int i = col_loop_cnt; i != 0; i--) + { + int32_t ker_0, ker_1, vec; + rhs_0 = read_and_pad(rhs_0, &ker_0, &ker_1); + + vec = arm_nn_read_q15x2_ia(&lhs_vec); + acc_0 = __SMLAD(ker_0, vec, acc_0); + + vec = arm_nn_read_q15x2_ia(&lhs_vec); + acc_0 = __SMLAD(ker_1, vec, acc_0); + } + + acc_64_0 += acc_0; + + for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = (*lhs_vec); + lhs_vec++; + acc_64_0 += lhs_temp * (*rhs_0); + rhs_0++; + } + + if (bias) + { + acc_64_0 += *bias++; + } + q31_t tmp; + tmp = arm_nn_requantize_s64(acc_64_0, dst_multiplier, dst_shift); + tmp = MAX(tmp, activation_min); + tmp = MIN(tmp, activation_max); + *dst++ = (q15_t)tmp; + } + +#else + for (int i_row_loop_cnt = 0; i_row_loop_cnt < rhs_rows; i_row_loop_cnt++) + { + const q15_t *lhs_ptr = lhs; + const q7_t *rhs_ptr_0 = &rhs[0]; + + q63_t result = 0; + + if (bias) + { + result = *bias++; + } + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const q63_t rhs_value0 = (int8_t)*rhs_ptr_0; + const q63_t lhs_value = *lhs_ptr; + + result += lhs_value * rhs_value0; + + ++rhs_ptr_0; + ++lhs_ptr; + } + + // Quantize down + result = arm_nn_requantize_s64(result, dst_multiplier, dst_shift); + + // Clamp the result + result = ((result) > (activation_min) ? (result) : (activation_min)); + result = ((result) < (activation_max) ? (result) : (activation_max)); + + *dst++ = (q15_t)result; + rhs += rhs_cols; + } +#endif + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c new file mode 100644 index 0000000..c7dfd14 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c @@ -0,0 +1,402 @@ +/* + * Copyright (C) 2020-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_vec_mat_mult_t_s8 + * Description: s8 vector by matrix (transposed) multiplication + * + * $Date: 28 April 2022 + * $Revision: V.3.0.1 + * + * Target Processor: Cortex-M + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 vector(lhs) by matrix (transposed) multiplication + * + * Refer header file for details. + * + */ +arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs, + const q7_t *rhs, + const q31_t *bias, + q7_t *dst, + const int32_t lhs_offset, + const int32_t rhs_offset, + const int32_t dst_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max, + const int32_t address_offset) +{ + (void)rhs_offset; +#if defined(ARM_MATH_MVEI) + const int32_t row_loop_cnt = rhs_rows / 3; + const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3}; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + const int8_t *rhs_2 = rhs + 2 * rhs_cols; + + int32_t rhs_sum_0 = 0; + int32_t rhs_sum_1 = 0; + int32_t rhs_sum_2 = 0; + + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); + rhs_sum_1 = vaddvaq_p_s8(rhs_sum_1, ker_1, p); + acc_1 = vmladavaq_p_s8(acc_1, ker_1, input, p); + + const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); + rhs_sum_2 = vaddvaq_p_s8(rhs_sum_2, ker_2, p); + acc_2 = vmladavaq_p_s8(acc_2, ker_2, input, p); + + lhs_vec += 16; + rhs_0 += 16; + rhs_1 += 16; + rhs_2 += 16; + } + rhs += 3 * rhs_cols; + + int32x4_t acc = {acc_0, acc_1, acc_2, 0}; + mve_pred16_t p = vctp32q(3); + if (bias) + { + int32x4_t b = vldrwq_z_s32(bias, p); + acc = vaddq_m_s32(vuninitializedq_s32(), acc, b, p); + bias += 3; + } + const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0}; + acc += vdupq_n_s32(lhs_offset) * rhs_sum; + + acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); + acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); + acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); + acc = vminq_s32(acc, vdupq_n_s32(activation_max)); + + if (address_offset > 1L) + { + vstrbq_scatter_offset_s32(dst, address_offset_array, acc); + } + else + { + vstrbq_p_s32(dst, acc, p); + } + dst += 3 * address_offset; + } + + const int loop_cnt = rhs_rows % 3; + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + int32_t rhs_sum_0 = 0; + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + lhs_vec += 16; + rhs_0 += 16; + } + rhs += rhs_cols; + + if (bias) + { + acc_0 += *bias; + bias++; + } + const int32_t offsets = rhs_sum_0 * lhs_offset; + acc_0 += offsets; + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_0 += dst_offset; + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + *dst = MIN(acc_0, activation_max); + dst += address_offset; + } + +#elif defined(ARM_MATH_DSP) + const int32_t row_loop_cnt = rhs_rows / 2; + const int16_t lhs_offset_s16 = (int16_t)lhs_offset; + const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16); + + for (int32_t i = 0; i < row_loop_cnt; i++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + if (bias) + { + acc_0 = *bias++; + acc_1 = *bias++; + } + + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + rhs += 2 * rhs_cols; + + for (int j = col_loop_cnt; j != 0; j--) + { + int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); + int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + + vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); + int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = __SXTB16(ker_0); + + acc_0 = __SMLAD(ker_1, vec_1, acc_0); + acc_0 = __SMLAD(ker_0, vec_0, acc_0); + + ker_0 = arm_nn_read_q7x4_ia(&rhs_1); + ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = __SXTB16(ker_0); + + acc_1 = __SMLAD(ker_1, vec_1, acc_1); + acc_1 = __SMLAD(ker_0, vec_0, acc_1); + } + + for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0); + rhs_0++; + acc_1 += lhs_temp * (*rhs_1); + rhs_1++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset; + acc_1 += dst_offset; + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + acc_1 = MAX(acc_1, activation_min); + acc_1 = MIN(acc_1, activation_max); + *dst = (int8_t)acc_0; + *(dst + address_offset) = (int8_t)acc_1; + dst += 2 * address_offset; + } + + if (rhs_rows & 0x1) + { + int32_t acc_0 = 0; + if (bias) + { + acc_0 = *bias++; + } + const int32_t col_loop_cnt = rhs_cols / 4; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + + for (int i = col_loop_cnt; i != 0; i--) + { + int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); + int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + + int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); + int32_t ker_1 = __SXTB16_RORn((uint32_t)ker_0, 8); + ker_0 = __SXTB16(ker_0); + + acc_0 = __SMLAD(ker_1, vec_1, acc_0); + acc_0 = __SMLAD(ker_0, vec_0, acc_0); + } + + for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0); + rhs_0++; + } + + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Add offset + acc_0 += dst_offset; + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + *dst = (int8_t)acc_0; + dst += address_offset; + } + +#else + + const int32_t row_loop_cnt = rhs_rows / 3; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const q7_t *lhs_ptr = lhs; + const q7_t *rhs_ptr_0 = &rhs[0]; + const q7_t *rhs_ptr_1 = &rhs[rhs_cols]; + const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + q31_t res00 = 0; + q31_t res01 = 0; + q31_t res02 = 0; + if (bias) + { + res00 = *bias++; + res01 = *bias++; + res02 = *bias++; + } + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const q31_t rhs_value0 = (int8_t)*rhs_ptr_0; + const q31_t rhs_value1 = (int8_t)*rhs_ptr_1; + const q31_t rhs_value2 = (int8_t)*rhs_ptr_2; + const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + res02 += lhs_value * rhs_value2; + + ++rhs_ptr_0; + ++rhs_ptr_1; + ++rhs_ptr_2; + ++lhs_ptr; + } + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); + res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); + + // Add offset + res00 += dst_offset; + res01 += dst_offset; + res02 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res02 = MAX(res02, activation_min); + res02 = MIN(res02, activation_max); + + *dst = (q7_t)res00; + *(dst + address_offset) = (q7_t)res01; + *(dst + 2 * address_offset) = (q7_t)res02; + dst += 3 * address_offset; + + rhs += 3 * rhs_cols; + } + + const int loop_cnt = rhs_rows % 3; + + for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) + { + const q7_t *lhs_ptr = &lhs[0]; + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = 0; + if (bias) + { + res00 = *bias++; + } + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q31_t rhs_value0 = (int8_t)rhs_ptr[0]; + q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + + res00 += lhs_value * rhs_value0; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + + // Add offset + res00 += dst_offset; + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + *dst = (int8_t)res00; + dst += address_offset; + rhs += rhs_cols; + } +#endif + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c new file mode 100644 index 0000000..5b821c3 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_vec_mat_mult_t_svdf_s8 + * Description: s8 vector by matrix (transposed) multiplication with + * s16 output. Targetted at SVDF operator. + * + * $Date: 15. April 2021 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup NNBasicMath + * @{ + */ + +/* + * s8 vector(lhs) by matrix (transposed) multiplication + * + * Refer header file for details. + * + */ +arm_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs, + const q7_t *rhs, + q15_t *dst, + const int32_t lhs_offset, + const int32_t rhs_offset, + const int32_t dst_offset, + const int32_t dst_multiplier, + const int32_t dst_shift, + const int32_t rhs_cols, + const int32_t rhs_rows, + const int32_t activation_min, + const int32_t activation_max) +{ + (void)rhs_offset; + if (rhs_cols < 0 || (NN_Q31_MAX - rhs_cols) < 16 || dst_offset < 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + (void)rhs_offset; +#if defined(ARM_MATH_MVEI) + int32_t row_loop_cnt = rhs_rows / 3; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + const int8_t *rhs_2 = rhs + 2 * rhs_cols; + + int32_t rhs_sum_0 = 0; + int32_t rhs_sum_1 = 0; + int32_t rhs_sum_2 = 0; + + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); + rhs_sum_1 = vaddvaq_p_s8(rhs_sum_1, ker_1, p); + acc_1 = vmladavaq_p_s8(acc_1, ker_1, input, p); + + const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); + rhs_sum_2 = vaddvaq_p_s8(rhs_sum_2, ker_2, p); + acc_2 = vmladavaq_p_s8(acc_2, ker_2, input, p); + + lhs_vec += 16; + rhs_0 += 16; + rhs_1 += 16; + rhs_2 += 16; + } + rhs += 3 * rhs_cols; + + int32x4_t acc = {acc_0, acc_1, acc_2, 0}; + const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0}; + acc += vdupq_n_s32(lhs_offset) * rhs_sum; + + acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); + acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); + acc = vminq_s32(acc, vdupq_n_s32(activation_max)); + *(dst) = (int16_t)acc[0]; + *(dst + dst_offset) = (int16_t)acc[1]; + *(dst + 2 * dst_offset) = (int16_t)acc[2]; + dst += 3 * dst_offset; + } + + const int loop_cnt = rhs_rows % 3; + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + int32_t acc_0 = 0; + const int32_t col_loop_cnt = (rhs_cols + 15) / 16; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + int32_t rhs_sum_0 = 0; + uint32_t col_cnt = (uint32_t)rhs_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + const int8x16_t input = vldrbq_z_s8(lhs_vec, p); + + const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); + rhs_sum_0 = vaddvaq_p_s8(rhs_sum_0, ker_0, p); + acc_0 = vmladavaq_p_s8(acc_0, ker_0, input, p); + + lhs_vec += 16; + rhs_0 += 16; + } + rhs += rhs_cols; + + const int32_t offsets = rhs_sum_0 * lhs_offset; + acc_0 = __QADD(acc_0, offsets); + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + *dst = (q15_t)MIN(acc_0, activation_max); + dst += dst_offset; + } + +#elif defined(ARM_MATH_DSP) + int32_t row_loop_cnt = rhs_rows / 2; + + const int16_t lhs_offset_s16 = lhs_offset; + const int16_t rhs_offset_s16 = rhs_offset; + + const uint32_t lhs_offset_s16x2 = __PKHBT(lhs_offset_s16, lhs_offset_s16, 16); + const uint32_t rhs_offset_s16x2 = __PKHBT(rhs_offset_s16, rhs_offset_s16, 16); + for (int32_t i = 0; i < row_loop_cnt; i++) + { + int32_t acc_0 = 0; + int32_t acc_1 = 0; + + const int32_t col_loop_cnt = rhs_cols / 4; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + const int8_t *rhs_1 = rhs + rhs_cols; + rhs += 2 * rhs_cols; + for (int j = col_loop_cnt; j != 0; j--) + { + int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); + int32_t vec_1 = __SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)vec_0, 8); + vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); + int32_t ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); + ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); + acc_0 = __SMLAD(ker_1, vec_1, acc_0); + acc_0 = __SMLAD(ker_0, vec_0, acc_0); + ker_0 = arm_nn_read_q7x4_ia(&rhs_1); + ker_1 = __SXTAB16_RORn(rhs_offset_s16x2, (uint32_t)ker_0, 8); + ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); + acc_1 = __SMLAD(ker_1, vec_1, acc_1); + acc_1 = __SMLAD(ker_0, vec_0, acc_1); + } + for (int k = col_loop_cnt * 4; k < rhs_cols; k++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0 + rhs_offset); + rhs_0++; + acc_1 += lhs_temp * (*rhs_1 + rhs_offset); + rhs_1++; + } + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + acc_1 = arm_nn_requantize(acc_1, dst_multiplier, dst_shift); + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + acc_1 = MAX(acc_1, activation_min); + acc_1 = MIN(acc_1, activation_max); + *dst = (q15_t)acc_0; + *(dst + dst_offset) = (q15_t)acc_1; + dst += 2 * dst_offset; + } + if (rhs_rows & 0x1) + { + int32_t acc_0 = 0; + const int32_t col_loop_cnt = rhs_cols / 4; + const int8_t *lhs_vec = lhs; + const int8_t *rhs_0 = rhs; + for (int i = col_loop_cnt; i != 0; i--) + { + int32_t vec_0 = arm_nn_read_q7x4_ia(&lhs_vec); + int32_t vec_1 = __SXTAB16(lhs_offset_s16x2, __ROR((uint32_t)vec_0, 8)); + vec_0 = __SXTAB16(lhs_offset_s16x2, vec_0); + int32_t ker_0 = arm_nn_read_q7x4_ia(&rhs_0); + int32_t ker_1 = __SXTAB16(rhs_offset_s16x2, __ROR((uint32_t)ker_0, 8)); + ker_0 = __SXTAB16(rhs_offset_s16x2, ker_0); + acc_0 = __SMLAD(ker_1, vec_1, acc_0); + acc_0 = __SMLAD(ker_0, vec_0, acc_0); + } + for (int j = col_loop_cnt * 4; j < rhs_cols; j++) + { + const int32_t lhs_temp = (*lhs_vec + lhs_offset); + lhs_vec++; + acc_0 += lhs_temp * (*rhs_0 + rhs_offset); + rhs_0++; + } + acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); + + // Clamp the result + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + *dst = (q15_t)acc_0; + dst += dst_offset; + } + +#else + + int32_t row_loop_cnt = rhs_rows / 3; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const q7_t *lhs_ptr = lhs; + const q7_t *rhs_ptr_0 = &rhs[0]; + const q7_t *rhs_ptr_1 = &rhs[rhs_cols]; + const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2]; + + q31_t res00 = 0; + q31_t res01 = 0; + q31_t res02 = 0; + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + const q31_t rhs_value0 = (int8_t)*rhs_ptr_0; + const q31_t rhs_value1 = (int8_t)*rhs_ptr_1; + const q31_t rhs_value2 = (int8_t)*rhs_ptr_2; + const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset; + + res00 += lhs_value * rhs_value0; + res01 += lhs_value * rhs_value1; + res02 += lhs_value * rhs_value2; + + ++rhs_ptr_0; + ++rhs_ptr_1; + ++rhs_ptr_2; + ++lhs_ptr; + } + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift); + res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift); + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + res01 = MAX(res01, activation_min); + res01 = MIN(res01, activation_max); + res02 = MAX(res02, activation_min); + res02 = MIN(res02, activation_max); + + *dst = (q15_t)res00; + *(dst + dst_offset) = (q15_t)res01; + *(dst + 2 * dst_offset) = (q15_t)res02; + dst += 3 * dst_offset; + rhs += 3 * rhs_cols; + } + + const int loop_cnt = rhs_rows % 3; + + for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++) + { + const q7_t *lhs_ptr = &lhs[0]; + const q7_t *rhs_ptr = &rhs[0]; + + q31_t res00 = 0; + + for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx) + { + q31_t rhs_value0 = (int8_t)rhs_ptr[0] + rhs_offset; + q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset; + + res00 += lhs_value * rhs_value0; + + ++rhs_ptr; + ++lhs_ptr; + } + + // Quantize down + res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift); + + // Clamp the result + res00 = MAX(res00, activation_min); + res00 = MIN(res00, activation_max); + + *dst = (q15_t)res00; + dst += dst_offset; + rhs += rhs_cols; + } +#endif + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c new file mode 100644 index 0000000..5a8cea2 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c @@ -0,0 +1,203 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nntables.c + * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift + * + * $Date: 17. January 2018 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @brief tables for various activation functions + * + * This file include the declaration of common tables. + * Most of them are used for activation functions + * + * Assumption: + * Unified table: input is 3.x format, i.e, range of [-8, 8) + * sigmoid(8) = 0.9996646498695336 + * tanh(8) = 0.9999997749296758 + * The accuracy here should be good enough + * + * 2-stage HL table: + * + * The entire input range is divided into two parts: + * + * Low range table: 0x000x xxxx or 0x111x xxxx + * table entry will be the binary number excluding the first + * two digits, i.e., 0x0x xxxx or 0x1x xxxx + * + * + * + * High range table 0x0010 0000 -- 0x0111 1111 + * 0x1000 0000 -- 0x1101 1111 + * + * For positive numbers, table entry will be + * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 + * i.e., 0x0000 0000 - 0x0101 11111 + * + * same thing for the negative numbers, table entry will be + * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 + * i.e., 0x0110 0000 - 0x1011 1111 + */ + +const q7_t sigmoidTable_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, 0x5e, 0x5f, 0x61, + 0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, + 0x75, 0x76, 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c, + 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, 0x06, + 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x11, 0x12, + 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, + 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + +const q15_t sigmoidTable_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, + 0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, 0x68a6, 0x69d2, 0x6af1, 0x6c05, + 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, + 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, + 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, + 0x7e69, 0x7e81, 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, + 0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, 0x7faf, 0x7fb4, + 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, + 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, + 0x7ff4, 0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, + 0x0017, 0x0019, 0x001a, 0x001c, 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, + 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, + 0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, 0x024d, 0x0273, 0x029a, 0x02c4, + 0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, + 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, + 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, + 0x1f5f, 0x20e0, 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615, + 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + +const q15_t sigmoidLTable_q15[128] = { + 0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, + 0x4cd3, 0x4dc8, 0x4ebb, 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, 0x56ef, 0x57cd, + 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, + 0x62cc, 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, 0x68a6, 0x693d, 0x69d2, 0x6a63, + 0x6af1, 0x6b7c, 0x6c05, 0x6c8a, 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, 0x0f42, + 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, + 0x162e, 0x16c3, 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, 0x1c81, 0x1d34, 0x1dea, + 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833, + 0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, 0x3053, 0x3145, 0x3238, 0x332d, 0x3424, + 0x351b, 0x3615, 0x370f, 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00, +}; + +const q15_t sigmoidHTable_q15[192] = { + 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, + 0x792a, 0x798f, 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, + 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, 0x7e98, 0x7eae, + 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, + 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, + 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, + 0x7fe9, 0x7fea, 0x7feb, 0x7fed, 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c, + 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, + 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, 0x0085, 0x008e, 0x0097, 0x00a1, + 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f, + 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, + 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, + 0x08a5, 0x092a, 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, +}; + +const q7_t tanhTable_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, 0x61, 0x65, 0x68, + 0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, 0x85, 0x85, 0x86, 0x87, + 0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, + 0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + +const q15_t tanhTable_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, + 0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, 0x73dc, 0x753a, 0x7672, 0x7788, + 0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, + 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, + 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, + 0x7ff6, 0x7ff7, 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, + 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, + 0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, 0x8016, 0x8019, 0x801c, 0x8020, + 0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, + 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, + 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, + 0x9869, 0x9b50, 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941, + 0xe0a7, 0xe847, 0xf015, 0xf803, +}; + +const q15_t tanhLTable_q15[128] = { + 0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, + 0x3151, 0x34ae, 0x37f6, 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, 0x514d, 0x53a3, + 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, + 0x6b6e, 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, 0x73dc, 0x7490, 0x753a, 0x75da, + 0x7672, 0x7701, 0x7788, 0x7807, 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, 0x849b, + 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, + 0x8ac6, 0x8b70, 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, 0x936b, 0x9492, 0x95c9, + 0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d, + 0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, + 0xd5a8, 0xd941, 0xdcec, 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00, +}; + +const q15_t tanhHTable_q15[192] = { + 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, + 0x7f30, 0x7f48, 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, + 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, 0x7ff8, 0x7ff9, + 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8002, + 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009, + 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, + 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, + 0x8156, 0x8183, 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, +}; diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c new file mode 100644 index 0000000..6f2f575 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_q7_to_q15_no_shift.c + * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift + * + * $Date: May 29, 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup nndata_convert + * @{ + */ + +/** + * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift + * @param[in] *pSrc points to the Q7 input vector + * @param[out] *pDst points to the Q15 output vector + * @param[in] blockSize length of the input vector + * + * \par Description: + * + * The equation used for the conversion process is: + * + * <pre> + * pDst[n] = (q15_t) pSrc[n]; 0 <= n < blockSize. + * </pre> + * + */ + +void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize) +{ + const q7_t *pIn = pSrc; + uint32_t blkCnt; + +#if defined(ARM_MATH_DSP) + q31_t in; + q31_t in1, in2; + q31_t out1, out2; + + /*loop Unrolling */ + blkCnt = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. */ + while (blkCnt > 0u) + { + in = arm_nn_read_q7x4_ia(&pIn); + + /* rotatate in by 8 and extend two q7_t values to q15_t values */ + in1 = __SXTB16(__ROR((uint32_t)in, 8)); + + /* extend remaining two q7_t values to q15_t values */ + in2 = __SXTB16(in); + +#ifndef ARM_MATH_BIG_ENDIAN + out2 = (int32_t)__PKHTB(in1, in2, 16); + out1 = (int32_t)__PKHBT(in2, in1, 16); +#else + out1 = (int32_t)__PKHTB(in1, in2, 16); + out2 = (int32_t)__PKHBT(in2, in1, 16); +#endif + arm_nn_write_q15x2_ia(&pDst, out1); + arm_nn_write_q15x2_ia(&pDst, out2); + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + +#else + + /* Run the below code for Cortex-M0 */ + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + + while (blkCnt > 0u) + { + /* convert from q7 to q15 and then store the results in the destination buffer */ + *pDst++ = (q15_t)*pIn++; + + /* Decrement the loop counter */ + blkCnt--; + } +} + +/** + * @} end of nndata_convert group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c new file mode 100644 index 0000000..8abbc3a --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_q7_to_q15_reordered_no_shift.c + * Description: Converts the elements of the Q7 vector to reordered Q15 vector without left-shift + * + * $Date: July 20, 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup nndata_convert + * @{ + */ + +/** + * @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift + * @param[in] *pSrc points to the Q7 input vector + * @param[out] *pDst points to the Q15 output vector + * @param[in] blockSize length of the input vector + * + * @details + * + * This function does the q7 to q15 expansion with re-ordering + * + * <pre> + * | A1 | A2 | A3 | A4 | + * + * 0 7 8 15 16 23 24 31 + * </pre> + * + * is converted into: + * + * <pre> + * | A1 | A3 | and | A2 | A4 | + * + * 0 15 16 31 0 15 16 31 + * </pre> + * + * + * This looks strange but is natural considering how sign-extension is done at + * assembly level. + * + * The expansion of other other oprand will follow the same rule so that the end + * results are the same. + * + * The tail (i.e., last (N % 4) elements) will still be in original order. + * + */ + +void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize) +{ + const q7_t *pIn = pSrc; /* Src pointer */ + uint32_t blkCnt; /* loop counter */ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + q31_t in; + q31_t in1, in2; + + /* Run the below code for Cortex-M4 and Cortex-M3 */ + + /*loop Unrolling */ + blkCnt = blockSize >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0u) + { + /* C = (q15_t) A << 8 */ + /* convert from q7 to q15 and then store the results in the destination buffer */ + in = arm_nn_read_q7x4_ia(&pIn); + + /* rotatate in by 8 and extend two q7_t values to q15_t values */ + in1 = __SXTB16(__ROR((uint32_t)in, 8)); + + /* extend remainig two q7_t values to q15_t values */ + in2 = __SXTB16(in); + +#ifndef ARM_MATH_BIG_ENDIAN + arm_nn_write_q7x4_ia((q7_t **)&pDst, in2); + arm_nn_write_q7x4_ia((q7_t **)&pDst, in1); +#else + arm_nn_write_q7x4_ia((q7_t **)&pDst, in1); + arm_nn_write_q7x4_ia((q7_t **)&pDst, in2); +#endif + + /* Decrement the loop counter */ + blkCnt--; + } + + /* If the blockSize is not a multiple of 4, compute any remaining output samples here. + ** No loop unrolling is used. */ + blkCnt = blockSize % 0x4u; + +#else + + /* Run the below code for Cortex-M0 */ + + /* Loop over blockSize number of values */ + blkCnt = blockSize; + +#endif /* #ifndef ARM_MATH_CM0_FAMILY */ + + while (blkCnt > 0u) + { + /* C = (q15_t) A << 8 */ + /* convert from q7 to q15 and then store the results in the destination buffer */ + *pDst++ = (q15_t)*pIn++; + + /* Decrement the loop counter */ + blkCnt--; + } +} + +/** + * @} end of q7_to_x group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c new file mode 100644 index 0000000..765929d --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_q7_to_q15_reordered_with_offset.c + * Description: Converts the elements of the Q7 vector to a reordered Q15 vector with an added offset. The re-ordering + * is a signature of sign extension intrinsic(DSP extension). + * + * $Date: May 29, 2020 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup nndata_convert + * @{ + */ + +/** + * @brief Converts the elements of the Q7 vector to a reordered Q15 vector with an added offset. + * + * @note Refer header file for details. + * + */ + +void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset) +{ + +#if defined(ARM_MATH_DSP) + uint32_t block_cnt; + /* Run the below code for cores that support SIMD instructions */ + q31_t in_q7x4; + q31_t out_q15x2_1; + q31_t out_q15x2_2; + + /*loop unrolling */ + block_cnt = block_size >> 2u; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. */ + const q31_t offset_q15x2 = (q31_t)__PKHBT(offset, offset, 16); + while (block_cnt > 0u) + { + /* convert from q7 to q15 and then store the results in the destination buffer */ + in_q7x4 = arm_nn_read_q7x4_ia(&src); + + /* Extract and sign extend each of the four q7 values to q15 */ + out_q15x2_1 = __SXTAB16(offset_q15x2, __ROR((uint32_t)in_q7x4, 8)); + out_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4); + + arm_nn_write_q15x2_ia(&dst, out_q15x2_2); + arm_nn_write_q15x2_ia(&dst, out_q15x2_1); + + block_cnt--; + } + /* Handle left over samples */ + block_cnt = block_size % 0x4u; + + while (block_cnt > 0u) + { + *dst++ = (q15_t)*src++ + offset; + + /* Decrement the loop counter */ + block_cnt--; + } +#else + (void)src; + (void)dst; + (void)block_size; + (void)offset; + /* Not available */ +#endif +} + +/** + * @} end of nndata_convert group + */ diff --git a/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c new file mode 100644 index 0000000..ea29986 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in_q7x4 compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in_q7x4 writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_q7_to_q15_with_offset.c + * Description: Converts the elements of the Q7 vector to Q15 vector with an added offset + * + * $Date: March 3, 2020 + * $Revision: V.2.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup nndata_convert + * @{ + */ + +void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset) +{ + int block_cnt; + +#if defined(ARM_MATH_MVEI) + + int16x8_t source; + const int16x8_t source_offset = vdupq_n_s16(offset); + block_cnt = block_size / 8; + + while (block_cnt > 0) + { + source = vldrbq_s16(src); + source = vaddq_s16(source, source_offset); + vstrhq_s16(dst, source); + dst += 8; + src += 8; + block_cnt--; + } + + block_cnt = block_size & 0x7; + +#elif defined(ARM_MATH_DSP) + /* Run the below code for cores that support SIMD instructions */ + q31_t in_q7x4; + q31_t in_q15x2_1; + q31_t in_q15x2_2; + q31_t out_q15x2_1; + q31_t out_q15x2_2; + + /*loop unrolling */ + block_cnt = block_size >> 2; + + /* First part of the processing with loop unrolling. Compute 4 outputs at a time. */ + const q31_t offset_q15x2 = __PKHBT(offset, offset, 16); + while (block_cnt > 0) + { + /* convert from q7 to q15 and then store the results in the destination buffer */ + in_q7x4 = arm_nn_read_q7x4_ia(&src); + + /* Extract and sign extend each of the four q7 values to q15 */ + in_q15x2_1 = __SXTAB16(offset_q15x2, __ROR(in_q7x4, 8)); + in_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4); + + out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); + out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); + + arm_nn_write_q15x2_ia(&dst, out_q15x2_1); + arm_nn_write_q15x2_ia(&dst, out_q15x2_2); + + block_cnt--; + } + /* Handle left over samples */ + block_cnt = block_size % 0x4; + +#else + /* Run the below code for Cortex-M0 */ + /* Loop over block_size number of values */ + block_cnt = block_size; +#endif + + while (block_cnt > 0) + { + *dst++ = (q15_t)*src++ + offset; + + /* Decrement the loop counter */ + block_cnt--; + } +} + +/** + * @} end of nndata_convert group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt new file mode 100644 index 0000000..a37503b --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2019-2022 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8.c") +file(GLOB SRC_S16 "./*_s16.c") +target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16}) + + + diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c new file mode 100644 index 0000000..5cd2b1c --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_avgpool_s16.c + * Description: Pooling function implementations + * + * $Date: 3. February 2022 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * s16 average pooling function + * + * Refer to header file for details. + * + */ +arm_status arm_avgpool_s16(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q15_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q15_t *dst) +{ + (void)ctx; + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t ch_src = input_dims->c; + + /* Reference C code adapted from CMSIS-NN arm_avgpool_s8.c. + */ + + for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) + { + for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */ + const int32_t ker_y_start = MAX(0, -base_idx_y); + const int32_t ker_x_start = MAX(0, -base_idx_x); + + /* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x); + + for (int i_ch_in = 0; i_ch_in < ch_src; i_ch_in++) + { + int sum = 0; + int count = 0; + + for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) + { + sum += src[i_ch_in + ch_src * (k_x + base_idx_x + (k_y + base_idx_y) * input_x)]; + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum; + } + } + } + + return ARM_MATH_SUCCESS; +} + +int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src) +{ + (void)output_x; + (void)ch_src; + return 0; +} + +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c new file mode 100644 index 0000000..3e9861e --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_avgpool_s8.c + * Description: Pooling function implementations + * + * $Date: 01. March 2021 + * $Revision: V.2.0.4 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + +static void scale_q31_to_q7_and_clamp(const q31_t *buffer, + q7_t *target, + int32_t length, + const int32_t count, + const int act_min, + const int act_max) +{ + const int half_count = count / 2; + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return; + } + + for (int i = 0; i < length; i++) + { + int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count); + sum = sum / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + target[i] = (q7_t)sum; + } +} +#endif + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * s8 average pooling function + * + * Refer to header file for details. + * + */ + +#if defined(ARM_MATH_MVEI) + +arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + (void)ctx; + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t ch_src = input_dims->c; + + int32_t i_x, i_y; + int32_t k_x, k_y; + + for (i_y = 0; i_y < output_y; i_y++) + { + for (i_x = 0; i_x < output_x; i_x++) + { + + int32_t k_y_start, k_y_end; + int32_t k_x_start, k_x_end; + int32_t chCnt; + const int8_t *pTmp, *pTmpInner; + int8_t *pDst; + + k_y_start = MAX(0, i_y * stride_y - pad_y); + k_y_end = MIN(i_y * stride_y - pad_y + kernel_y, input_y); + + k_x_start = MAX(0, i_x * stride_x - pad_x); + k_x_end = MIN(i_x * stride_x - pad_x + kernel_x, input_x); + + pTmp = src; + pDst = &dst[ch_src * (i_x + i_y * output_x)]; + + chCnt = ch_src >> 4; + while (chCnt > 0) + { + int32x4_t sumV1, sumV2, sumV3, sumV4; + + int8x16_t tempV; + int16x8_t tempVLO, tempVHI; + int32x4_t tempVLOLO, tempVLOHI, tempVHILO, tempVHIHI; + int32_t count = 0; + + sumV1 = vdupq_n_s32(0); + sumV2 = vdupq_n_s32(0); + sumV3 = vdupq_n_s32(0); + sumV4 = vdupq_n_s32(0); + + for (k_y = k_y_start; k_y < k_y_end; k_y++) + { + for (k_x = k_x_start; k_x < k_x_end; k_x++) + { + pTmpInner = pTmp + (ch_src * (k_x + k_y * input_x)); + tempV = vldrbq_s8(pTmpInner); + + tempVLO = vmovlbq_s8(tempV); + tempVHI = vmovltq_s8(tempV); + + tempVLOLO = vmovlbq_s16(tempVLO); + tempVLOHI = vmovltq_s16(tempVLO); + + tempVHILO = vmovlbq_s16(tempVHI); + tempVHIHI = vmovltq_s16(tempVHI); + + sumV1 = vaddq_s32(sumV1, tempVLOLO); + sumV2 = vaddq_s32(sumV2, tempVLOHI); + sumV3 = vaddq_s32(sumV3, tempVHILO); + sumV4 = vaddq_s32(sumV4, tempVHIHI); + + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sumV1[0] = sumV1[0] > 0 ? (sumV1[0] + count / 2) / count : (sumV1[0] - count / 2) / count; + sumV1[1] = sumV1[1] > 0 ? (sumV1[1] + count / 2) / count : (sumV1[1] - count / 2) / count; + sumV1[2] = sumV1[2] > 0 ? (sumV1[2] + count / 2) / count : (sumV1[2] - count / 2) / count; + sumV1[3] = sumV1[3] > 0 ? (sumV1[3] + count / 2) / count : (sumV1[3] - count / 2) / count; + + sumV2[0] = sumV2[0] > 0 ? (sumV2[0] + count / 2) / count : (sumV2[0] - count / 2) / count; + sumV2[1] = sumV2[1] > 0 ? (sumV2[1] + count / 2) / count : (sumV2[1] - count / 2) / count; + sumV2[2] = sumV2[2] > 0 ? (sumV2[2] + count / 2) / count : (sumV2[2] - count / 2) / count; + sumV2[3] = sumV2[3] > 0 ? (sumV2[3] + count / 2) / count : (sumV2[3] - count / 2) / count; + + sumV3[0] = sumV3[0] > 0 ? (sumV3[0] + count / 2) / count : (sumV3[0] - count / 2) / count; + sumV3[1] = sumV3[1] > 0 ? (sumV3[1] + count / 2) / count : (sumV3[1] - count / 2) / count; + sumV3[2] = sumV3[2] > 0 ? (sumV3[2] + count / 2) / count : (sumV3[2] - count / 2) / count; + sumV3[3] = sumV3[3] > 0 ? (sumV3[3] + count / 2) / count : (sumV3[3] - count / 2) / count; + + sumV4[0] = sumV4[0] > 0 ? (sumV4[0] + count / 2) / count : (sumV4[0] - count / 2) / count; + sumV4[1] = sumV4[1] > 0 ? (sumV4[1] + count / 2) / count : (sumV4[1] - count / 2) / count; + sumV4[2] = sumV4[2] > 0 ? (sumV4[2] + count / 2) / count : (sumV4[2] - count / 2) / count; + sumV4[3] = sumV4[3] > 0 ? (sumV4[3] + count / 2) / count : (sumV4[3] - count / 2) / count; + + sumV1 = vmaxq_s32(sumV1, vdupq_n_s32(act_min)); + sumV1 = vminq_s32(sumV1, vdupq_n_s32(act_max)); + + sumV2 = vmaxq_s32(sumV2, vdupq_n_s32(act_min)); + sumV2 = vminq_s32(sumV2, vdupq_n_s32(act_max)); + + sumV3 = vmaxq_s32(sumV3, vdupq_n_s32(act_min)); + sumV3 = vminq_s32(sumV3, vdupq_n_s32(act_max)); + + sumV4 = vmaxq_s32(sumV4, vdupq_n_s32(act_min)); + sumV4 = vminq_s32(sumV4, vdupq_n_s32(act_max)); + + tempVLO = vmovnbq_s32(tempVLO, sumV1); + tempVLO = vmovntq_s32(tempVLO, sumV2); + + tempVHI = vmovnbq_s32(tempVHI, sumV3); + tempVHI = vmovntq_s32(tempVHI, sumV4); + + tempV = vmovnbq_s16(tempV, tempVLO); + tempV = vmovntq_s16(tempV, tempVHI); + + vstrbq_s8(pDst, tempV); + pDst += 16; + + chCnt--; + pTmp += 16; + } + + chCnt = ch_src & 0xF; + while (chCnt > 0) + { + int32_t sum = 0; + int32_t count = 0; + + for (k_y = k_y_start; k_y < k_y_end; k_y++) + { + for (k_x = k_x_start; k_x < k_x_end; k_x++) + { + sum += pTmp[ch_src * (k_x + k_y * input_x)]; + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + *pDst++ = sum; + + chCnt--; + pTmp++; + } + } + } + return ARM_MATH_SUCCESS; +} + +#else +arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t ch_src = input_dims->c; + + if (ctx->buf == NULL && arm_avgpool_s8_get_buffer_size(output_dims->w, input_dims->c)) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q31_t *buffer = (q31_t *)ctx->buf; + +#if defined(ARM_MATH_DSP) + + /* Run the following code for CPU's with DSP extension + */ + for (int i_y = 0, idx_y = -pad_y; i_y < output_y; idx_y += stride_y, i_y++) + { + for (int i_x = 0, idx_x = -pad_x; i_x < output_x; idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: + (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */ + const int32_t kernel_y_start = MAX(0, -idx_y); + const int32_t kernel_x_start = MAX(0, -idx_x); + + /* Condition for kernel end dimension: + (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - idx_x); + + int count = 0; + + for (int k_y = kernel_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++) + { + const q7_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); + + if (count == 0) + { + for (int i = 0; i < ch_src; i++) + { + buffer[i] = start[i]; + } + } + else + { + for (int i = 0; i < ch_src; i++) + { + buffer[i] = __QADD(start[i], buffer[i]); + } + } + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + scale_q31_to_q7_and_clamp(buffer, dst, ch_src, count, act_min, act_max); + dst += ch_src; + } + } +#else + + /* Reference C code adapted from CMSIS-NN arm_avepool_q7_HWC. + */ + (void)buffer; + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_y = 0; i_y < output_y; i_y++) + { + for (i_x = 0; i_x < output_x; i_x++) + { + for (i_ch_in = 0; i_ch_in < ch_src; i_ch_in++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - pad_y; k_y < i_y * stride_y - pad_y + kernel_y; k_y++) + { + for (k_x = i_x * stride_x - pad_x; k_x < i_x * stride_x - pad_x + kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < input_y && k_x < input_x) + { + sum += src[i_ch_in + ch_src * (k_x + k_y * input_x)]; + count++; + } + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum; + } + } + } + +#endif + return ARM_MATH_SUCCESS; +} + +#endif /* ARM_MATH_MVEI */ + +int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src) +{ + (void)output_x; + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return (ch_src * sizeof(int32_t)); +#else + (void)ch_src; + return 0; +#endif +} +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c new file mode 100644 index 0000000..483f874 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_max_pool_s16.c + * Description: Pooling function implementations + * + * $Date: 24. January 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +static void compare_and_replace_if_larger(int16_t *base, const int16_t *target, int32_t length) +{ + q15_t *dst = base; + const q15_t *src = target; + union arm_nnword ref_max; + union arm_nnword comp_max; + int32_t cnt = length >> 1; + + while (cnt > 0l) + { + ref_max.word = arm_nn_read_q15x2(dst); + comp_max.word = arm_nn_read_q15x2_ia(&src); + + if (comp_max.half_words[0] > ref_max.half_words[0]) + { + ref_max.half_words[0] = comp_max.half_words[0]; + } + if (comp_max.half_words[1] > ref_max.half_words[1]) + { + ref_max.half_words[1] = comp_max.half_words[1]; + } + + arm_nn_write_q15x2_ia(&dst, ref_max.word); + + cnt--; + } + + if (length & 0x1) + { + if (*src > *dst) + { + *dst = *src; + } + } +} + +static void clamp_output(int16_t *source, int32_t length, const int16_t act_min, const int16_t act_max) +{ + union arm_nnword in; + int32_t cnt = length >> 1; + + while (cnt > 0l) + { + in.word = arm_nn_read_q15x2(source); + + in.half_words[0] = MAX(in.half_words[0], act_min); + in.half_words[0] = MIN(in.half_words[0], act_max); + in.half_words[1] = MAX(in.half_words[1], act_min); + in.half_words[1] = MIN(in.half_words[1], act_max); + + arm_nn_write_q15x2_ia(&source, in.word); + cnt--; + } + + if (length & 0x1) + { + int16_t comp = *source; + comp = MAX(comp, act_min); + comp = MIN(comp, act_max); + *source = comp; + } +} + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * Optimized s16 max pooling function + * + * Refer to header file for details. + * + */ + +arm_status arm_max_pool_s16(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const int16_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + int16_t *dst) +{ + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int16_t act_min = pool_params->activation.min; + const int16_t act_max = pool_params->activation.max; + const int32_t channel_in = input_dims->c; + (void)ctx; + int16_t *dst_base = dst; + + for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) + { + for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */ + const int32_t ker_y_start = MAX(0, -base_idx_y); + const int32_t ker_x_start = MAX(0, -base_idx_x); + + /* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x); + + int count = 0; + + for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) + { + const int16_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); + + if (count == 0) + { + memcpy(dst, start, channel_in * sizeof(int16_t)); + count++; + } + else + { + compare_and_replace_if_larger(dst, start, channel_in); + } + } + } + /* 'count' is expected to be non-zero here. */ + dst += channel_in; + } + } + + clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max); + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c new file mode 100644 index 0000000..4fbbc91 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_max_pool_s8.c + * Description: Pooling function implementations + * + * $Date: 20. July 2021 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int32_t length) +{ +#if defined(ARM_MATH_MVEI) + int32_t loop_count = (length + 15) / 16; + for (int i = 0; i < loop_count; i++) + { + mve_pred16_t p = vctp8q((uint32_t)length); + const int8x16_t op_1 = vldrbq_z_s8(base, p); + const int8x16_t op_2 = vldrbq_z_s8(target, p); + const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p); + vstrbq_p_s8(base, max, p); + base += 16; + target += 16; + length -= 16; + } +#else + q7_t *dst = base; + const q7_t *src = target; + union arm_nnword ref_max; + union arm_nnword comp_max; + int32_t cnt = length >> 2; + + while (cnt > 0l) + { + ref_max.word = arm_nn_read_q7x4(dst); + comp_max.word = arm_nn_read_q7x4_ia(&src); + + if (comp_max.bytes[0] > ref_max.bytes[0]) + { + ref_max.bytes[0] = comp_max.bytes[0]; + } + if (comp_max.bytes[1] > ref_max.bytes[1]) + { + ref_max.bytes[1] = comp_max.bytes[1]; + } + if (comp_max.bytes[2] > ref_max.bytes[2]) + { + ref_max.bytes[2] = comp_max.bytes[2]; + } + if (comp_max.bytes[3] > ref_max.bytes[3]) + { + ref_max.bytes[3] = comp_max.bytes[3]; + } + + arm_nn_write_q7x4_ia(&dst, ref_max.word); + + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0l) + { + if (*src > *dst) + { + *dst = *src; + } + dst++; + src++; + cnt--; + } +#endif +} + +static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, const int32_t act_max) +{ +#if defined(ARM_MATH_MVEI) + int32_t loop_count = (length + 15) / 16; + for (int i = 0; i < loop_count; i++) + { + mve_pred16_t p = vctp8q((uint32_t)length); + length -= 16; + const int8x16_t src = vldrbq_z_s8(source, p); + const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p); + const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p); + int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p); + res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p); + vstrbq_p_s8(source, res, p); + source += 16; + } +#else + union arm_nnword in; + int32_t cnt = length >> 2; + + while (cnt > 0l) + { + in.word = arm_nn_read_q7x4(source); + + in.bytes[0] = MAX(in.bytes[0], act_min); + in.bytes[0] = MIN(in.bytes[0], act_max); + in.bytes[1] = MAX(in.bytes[1], act_min); + in.bytes[1] = MIN(in.bytes[1], act_max); + in.bytes[2] = MAX(in.bytes[2], act_min); + in.bytes[2] = MIN(in.bytes[2], act_max); + in.bytes[3] = MAX(in.bytes[3], act_min); + in.bytes[3] = MIN(in.bytes[3], act_max); + + arm_nn_write_q7x4_ia(&source, in.word); + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0l) + { + int32_t comp = *source; + comp = MAX(comp, act_min); + comp = MIN(comp, act_max); + *source++ = (int8_t)comp; + cnt--; + } +#endif +} + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * Optimized s8 max pooling function + * + * Refer to header file for details. + * + */ + +arm_status arm_max_pool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t channel_in = input_dims->c; + (void)ctx; + q7_t *dst_base = dst; + + for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) + { + for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */ + const int32_t ker_y_start = MAX(0, -base_idx_y); + const int32_t ker_x_start = MAX(0, -base_idx_x); + + /* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x); + + int count = 0; + + for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) + { + const q7_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); + + if (count == 0) + { + arm_memcpy_q7(dst, start, channel_in); + count++; + } + else + { + compare_and_replace_if_larger_q7(dst, start, channel_in); + } + } + } + /* 'count' is expected to be non-zero here. */ + dst += channel_in; + } + } + + clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max); + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c new file mode 100644 index 0000000..5a3b1af --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c @@ -0,0 +1,464 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_pool_q7_HWC.c + * Description: Pooling function implementations + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + +/** + * @brief A few utility functions used by pooling functions + * + * + */ + +static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale) +{ + int i; + + for (i = 0; i < length; i++) + { + target[i] = (q7_t)(buffer[i] / scale); + } +} + +static void compare_and_replace_if_larger_q7(q7_t *base, // base data + const q7_t *target, // compare target + const uint16_t length // data length +) +{ + q7_t *pIn = base; + const q7_t *pCom = target; + union arm_nnword in; + union arm_nnword com; + uint16_t cnt = length >> 2; + + while (cnt > 0u) + { + in.word = arm_nn_read_q7x4((const q7_t *)pIn); + com.word = arm_nn_read_q7x4_ia((const q7_t **)&pCom); + + // if version + if (com.bytes[0] > in.bytes[0]) + in.bytes[0] = com.bytes[0]; + if (com.bytes[1] > in.bytes[1]) + in.bytes[1] = com.bytes[1]; + if (com.bytes[2] > in.bytes[2]) + in.bytes[2] = com.bytes[2]; + if (com.bytes[3] > in.bytes[3]) + in.bytes[3] = com.bytes[3]; + + arm_nn_write_q7x4_ia(&pIn, in.word); + + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0u) + { + if (*pCom > *pIn) + { + *pIn = *pCom; + } + pIn++; + pCom++; + cnt--; + } +} + +static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length) +{ + q15_t *pCnt = base; + q7_t *pV = target; + q31_t v1, v2, vo1, vo2; + uint16_t cnt = length >> 2; + q31_t in; + + while (cnt > 0u) + { + q31_t value = arm_nn_read_q7x4_ia((const q7_t **)&pV); + v1 = __SXTB16(__ROR(value, 8)); + v2 = __SXTB16(value); +#ifndef ARM_MATH_BIG_ENDIAN + + vo2 = __PKHTB(v1, v2, 16); + vo1 = __PKHBT(v2, v1, 16); + +#else + + vo1 = __PKHTB(v1, v2, 16); + vo2 = __PKHBT(v2, v1, 16); + +#endif + + in = arm_nn_read_q15x2(pCnt); + arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in)); + + in = arm_nn_read_q15x2(pCnt); + arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in)); + + cnt--; + } + cnt = length & 0x3; + while (cnt > 0u) + { + *pCnt++ += *pV++; + cnt--; + } +} + +#endif // ARM_MATH_DSP + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/** + * @brief Q7 max pooling function + * @param[in, out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA Not used + * @param[in,out] Im_out pointer to output tensor + * + * @details + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. + * + */ + +void arm_maxpool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out) +{ + (void)bufferA; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_x, i_y; + + /* first does the pooling along x axis */ + for (i_y = 0; i_y < dim_im_in; i_y++) + { + + for (i_x = 0; i_x < dim_im_out; i_x++) + { + /* for each output pixel */ + q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; + q7_t *win_start; + q7_t *win_stop; + if (i_x * stride - padding < 0) + { + win_start = target; + } + else + { + win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; + } + + if (i_x * stride - padding + dim_kernel >= dim_im_in) + { + win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; + } + else + { + win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; + } + + /* first step is to copy over initial data */ + /* arm_copy_q7(win_start, target, ch_im_in); */ + memmove(target, win_start, ch_im_in); + + /* start the max operation from the second part */ + win_start += ch_im_in; + for (; win_start < win_stop; win_start += ch_im_in) + { + compare_and_replace_if_larger_q7(target, win_start, ch_im_in); + } + } + } + + /* then does the pooling along y axis */ + for (i_y = 0; i_y < dim_im_out; i_y++) + { + + /* for each output row */ + q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; + q7_t *row_start; + q7_t *row_end; + /* setting the starting row */ + if (i_y * stride - padding < 0) + { + row_start = Im_in; + } + else + { + row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; + } + /* setting the stopping row */ + if (i_y * stride - padding + dim_kernel >= dim_im_in) + { + row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; + } + else + { + row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; + } + + /* copy over the first row */ + /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */ + memmove(target, row_start, dim_im_out * ch_im_in); + + /* move over to next row */ + row_start += ch_im_in * dim_im_in; + + for (; row_start < row_end; row_start += dim_im_in * ch_im_in) + { + compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in); + } + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out; i_y++) + { + for (i_x = 0; i_x < dim_im_out; i_x++) + { + int max = -129; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) + { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) + { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) + { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; + } + } + } + +#endif /* ARM_MATH_DSP */ +} + +/** + * @brief Q7 average pooling function + * @param[in,out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*dim_im_out*ch_im_in + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. + * + */ + +void arm_avepool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + q15_t *buffer = (q15_t *)bufferA; + int16_t i_x, i_y; + int16_t count = 0; + + /* first does the pooling along x axis */ + for (i_y = 0; i_y < dim_im_in; i_y++) + { + + for (i_x = 0; i_x < dim_im_out; i_x++) + { + /* for each output pixel */ + q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; + q7_t *win_start; + q7_t *win_stop; + if (i_x * stride - padding < 0) + { + win_start = target; + } + else + { + win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; + } + + if (i_x * stride - padding + dim_kernel >= dim_im_in) + { + win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; + } + else + { + win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; + } + + /* first step is to copy over initial data */ + arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in); + count = 1; + + /* start the max operation from the second part */ + win_start += ch_im_in; + for (; win_start < win_stop; win_start += ch_im_in) + { + accumulate_q7_to_q15(buffer, win_start, ch_im_in); + count++; + } + buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); + } + } + + /* then does the pooling along y axis */ + for (i_y = 0; i_y < dim_im_out; i_y++) + { + /* for each output row */ + q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; + q7_t *row_start; + q7_t *row_end; + /* setting the starting row */ + if (i_y * stride - padding < 0) + { + row_start = Im_in; + } + else + { + row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; + } + /* setting the stopping row */ + if (i_y * stride - padding + dim_kernel >= dim_im_in) + { + row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; + } + else + { + row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; + } + + /* copy over the first row */ + arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); + count = 1; + + /* move over to next row */ + row_start += ch_im_in * dim_im_in; + + for (; row_start < row_end; row_start += dim_im_in * ch_im_in) + { + accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); + count++; + } + buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count); + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + + (void)bufferA; + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out; i_y++) + { + for (i_x = 0; i_x < dim_im_out; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) + { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; + } + } + } + +#endif /* ARM_MATH_DSP */ +} + +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/ReshapeFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/ReshapeFunctions/CMakeLists.txt new file mode 100644 index 0000000..9d3f543 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ReshapeFunctions/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# Copyright (c) 2019-2021 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_*.c") +target_sources(cmsis-nn PRIVATE ${SRC}) diff --git a/Drivers/CMSIS/NN/Source/ReshapeFunctions/arm_reshape_s8.c b/Drivers/CMSIS/NN/Source/ReshapeFunctions/arm_reshape_s8.c new file mode 100644 index 0000000..5464ca1 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ReshapeFunctions/arm_reshape_s8.c @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_reshape_s8.c + * Description: Reshape a s8 vector + * + * $Date: September 2019 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Reshape + * @{ + */ + +/** + * Basic s8 reshape function. + * + * Refer header file for details. + * + */ + +void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size) +{ + arm_memcpy_q7(output, input, total_size); +} + +/** + * @} end of Reshape group + */ diff --git a/Drivers/CMSIS/NN/Source/SVDFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/SVDFunctions/CMakeLists.txt new file mode 100644 index 0000000..67c3f79 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SVDFunctions/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# Copyright (c) 2019-2021 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8.c") +target_sources(cmsis-nn PRIVATE ${SRC}) diff --git a/Drivers/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c b/Drivers/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c new file mode 100644 index 0000000..ea092dc --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c @@ -0,0 +1,271 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_svdf_s8.c + * Description: S8 basic SVDF layer function + * + * $Date: 28 April 2022 + * $Revision: V.3.0.1 + * + * Target Processor: Cortex-M processors + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup SVDF + * @{ + */ + +/* + * S8 SVDF layer function for TensorFlow Lite with 8 bit state tensor + * + * Refer to header file for details. + * + */ + +arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx, + const cmsis_nn_context *output_ctx, + const cmsis_nn_svdf_params *svdf_params, + const cmsis_nn_per_tensor_quant_params *input_quant_params, + const cmsis_nn_per_tensor_quant_params *output_quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *state_dims, + q7_t *state_data, + const cmsis_nn_dims *weights_feature_dims, + const q7_t *weights_feature_data, + const cmsis_nn_dims *weights_time_dims, + const q7_t *weights_time_data, + const cmsis_nn_dims *bias_dims, + const q31_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + (void)bias_dims; + (void)state_dims; + (void)output_dims; + + const q31_t multiplier_in = input_quant_params->multiplier; + const q31_t shift_in = input_quant_params->shift; + const q31_t multiplier_out = output_quant_params->multiplier; + const q31_t shift_2 = output_quant_params->shift; + const int32_t zp_in = svdf_params->input_offset; + const int32_t zp_out = svdf_params->output_offset; + const int32_t in_activation_min = svdf_params->input_activation.min; + const int32_t in_activation_max = svdf_params->input_activation.max; + const int32_t out_activation_min = svdf_params->output_activation.min; + const int32_t out_activation_max = svdf_params->output_activation.max; + const int16_t rank = svdf_params->rank; + + const int32_t input_batches = input_dims->n; + const int32_t input_height = input_dims->h; + const int32_t feature_batches = weights_feature_dims->n; + const int32_t time_batches = weights_time_dims->h; + const int32_t unit_count = feature_batches / rank; + + if (input_ctx->buf == NULL) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q31_t *buffer_a = (q31_t *)input_ctx->buf; + + if (output_ctx->buf == NULL) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q31_t *buffer_b = (q31_t *)output_ctx->buf; + + // Left shift state + memmove((int8_t *)state_data, + (int8_t *)state_data + 1, + (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int8_t))); + + // Matrix multiplication input * feature weight + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q7_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); + const q7_t *weight = weights_feature_data; + const q7_t *input = input_data + i_batch * input_height; + + arm_status res = arm_nn_vec_mat_mult_t_s8(input, + weight, + NULL, + res_ptr, + -zp_in, + 0, + 0, + multiplier_in, + shift_in, + input_height, + feature_batches, + in_activation_min, + in_activation_max, + time_batches); + + if (res != ARM_MATH_SUCCESS) + { + return res; + } + } + + // Matrix multiplicate time weight * state tensors + { + q31_t *ptr_a = buffer_a; + const int8_t *v2 = state_data; + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + const int8_t *v1 = weights_time_data; + + for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++) + { + *ptr_a = 0; + int32_t sum = 0; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + // Perform matrix multiplication in blocks of four + int j = 0; + int32_t block_count = time_batches >> 2; + for (int i = 0; i < block_count; i++) + { + j += 4; + + q31_t r1_1, r1_2, r2_1, r2_2; + v1 = read_and_pad_reordered(v1, &r1_1, &r1_2); + v2 = read_and_pad_reordered(v2, &r2_1, &r2_2); + sum = __SMLAD(r1_1, r2_1, sum); + sum = __SMLAD(r1_2, r2_2, sum); + } + + // Process the remaining data + for (; j < time_batches; j++) + { + sum += *v1 * *v2; + v1++; + v2++; + } +#else + for (int j = 0; j < time_batches; j++) + { + sum += *v1 * *v2; + v1++; + v2++; + } +#endif + + *ptr_a = sum; + ptr_a++; + } + } + } + + if (bias_data) + { + if (unit_count == feature_batches) + { + for (int i = 0; i < input_batches; i++) + { + q31_t *output_temp = buffer_b + i * feature_batches; + const q31_t *ptr_a = buffer_a + i * feature_batches; + + const int32_t *bi = bias_data; + for (int j = 0; j < feature_batches; j++) + { + output_temp[j] = ptr_a[j] + bi[j]; + } + } + } + else + { + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q31_t *output_data_temp = buffer_b + i_batch * unit_count; + q31_t *ptr_a = buffer_a + i_batch * feature_batches; + + for (int i = 0; i < unit_count; i++) + { + int32_t sum = bias_data[i]; + for (int j = 0; j < rank; j++) + { + sum += *ptr_a; + ptr_a++; + } + output_data_temp[i] = sum; + } + } + } + } + else + { + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q31_t *output_data_temp = buffer_b + i_batch * unit_count; + q31_t *ptr_a = buffer_a + i_batch * feature_batches; + + for (int i = 0; i < unit_count; i++) + { + int32_t sum = 0; + for (int j = 0; j < rank; j++) + { + sum += *ptr_a; + ptr_a++; + } + output_data_temp[i] = sum; + } + } + } + +#if defined(ARM_MATH_MVEI) + int32_t num_elements = input_batches * unit_count; + const int32_t loop_count = (num_elements + 3) / 4; + for (int i_op = 0; i_op < loop_count; i_op++) + { + mve_pred16_t p = vctp32q((uint32_t)num_elements); + int32x4_t op = vldrwq_z_s32(buffer_b, p); + op = arm_requantize_mve(op, multiplier_out, shift_2); + op = vaddq_n_s32(op, zp_out); + const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min); + const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max); + op = vmaxq_s32(op, min_vec); + op = vminq_s32(op, max_vec); + vstrbq_p_s32(output_data, op, p); + output_data += 4; + buffer_b += 4; + num_elements -= 4; + } +#else + for (int i = 0; i < input_batches * unit_count; i++) + { + output_data[i] = (q7_t)CLAMP( + arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min); + } +#endif + + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of SVDF group + */ diff --git a/Drivers/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c b/Drivers/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c new file mode 100644 index 0000000..988409b --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SVDFunctions/arm_svdf_state_s16_s8.c @@ -0,0 +1,267 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_svdf_s8.c + * Description: S8 basic SVDF layer function with s16 state tensor + * + * $Date: 28 April 2022 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M processors + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup SVDF + * @{ + */ + +/* + * S8 SVDF layer function for TensorFlow Lite with 16 bit state tensor + * + * Refer to header file for details. + * + */ + +arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx, + const cmsis_nn_context *output_ctx, + const cmsis_nn_svdf_params *svdf_params, + const cmsis_nn_per_tensor_quant_params *input_quant_params, + const cmsis_nn_per_tensor_quant_params *output_quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *state_dims, + q15_t *state_data, + const cmsis_nn_dims *weights_feature_dims, + const q7_t *weights_feature_data, + const cmsis_nn_dims *weights_time_dims, + const q15_t *weights_time_data, + const cmsis_nn_dims *bias_dims, + const q31_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + (void)bias_dims; + (void)state_dims; + (void)output_dims; + + const q31_t multiplier_in = input_quant_params->multiplier; + const q31_t shift_in = input_quant_params->shift; + const q31_t multiplier_out = output_quant_params->multiplier; + const q31_t shift_2 = output_quant_params->shift; + const int32_t zp_in = svdf_params->input_offset; + const int32_t zp_out = svdf_params->output_offset; + const int32_t in_activation_min = svdf_params->input_activation.min; + const int32_t in_activation_max = svdf_params->input_activation.max; + const int32_t out_activation_min = svdf_params->output_activation.min; + const int32_t out_activation_max = svdf_params->output_activation.max; + const int16_t rank = svdf_params->rank; + + const int32_t input_batches = input_dims->n; + const int32_t input_height = input_dims->h; + const int32_t feature_batches = weights_feature_dims->n; + const int32_t time_batches = weights_time_dims->h; + const int32_t unit_count = feature_batches / rank; + + if (input_ctx->buf == NULL) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q31_t *buffer_a = (q31_t *)input_ctx->buf; + + if (output_ctx->buf == NULL) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q31_t *buffer_b = (q31_t *)output_ctx->buf; + + // Left shift state + memmove((q15_t *)state_data, + (q15_t *)state_data + 1, + (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t))); + + // Matrix multiplication input * feature weight + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q15_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); + const q7_t *weight = weights_feature_data; + const q7_t *input = input_data + i_batch * input_height; + + arm_status res = arm_nn_vec_mat_mult_t_svdf_s8(input, + weight, + res_ptr, + -zp_in, + 0, + time_batches, + multiplier_in, + shift_in, + input_height, + feature_batches, + in_activation_min, + in_activation_max); + + if (res != ARM_MATH_SUCCESS) + { + return res; + } + } + + { + // Matrix multiplication time weight * state tensors + q31_t *ptr_a = buffer_a; + const q15_t *v2 = state_data; + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + const q15_t *v1 = weights_time_data; + + for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++) + { + *ptr_a = 0; + int32_t sum = 0; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + // Perform matrix multiplication in blocks of two + int j = 0; + int32_t block_count = time_batches >> 1; + for (int i = 0; i < block_count; i++) + { + j += 2; + q31_t r1 = arm_nn_read_q15x2_ia(&v1); + q31_t r2 = arm_nn_read_q15x2_ia(&v2); + + sum = __SMLAD(r1, r2, sum); + } + + // Process the remaining data + for (; j < time_batches; j++) + { + sum += *v1 * *v2; + v1++; + v2++; + } +#else + for (int j = 0; j < time_batches; j++) + { + sum += *v1 * *v2; + v1++; + v2++; + } +#endif + + *ptr_a = sum; + ptr_a++; + } + } + } + + if (bias_data) + { + if (unit_count == feature_batches) + { + for (int i = 0; i < input_batches; i++) + { + q31_t *output_temp = buffer_b + i * feature_batches; + const q31_t *ptr_a = buffer_a + i * feature_batches; + + const int32_t *bi = bias_data; + for (int j = 0; j < feature_batches; j++) + { + output_temp[j] = ptr_a[j] + bi[j]; + } + } + } + else + { + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q31_t *output_data_temp = buffer_b + i_batch * unit_count; + q31_t *ptr_a = buffer_a + i_batch * feature_batches; + + for (int i = 0; i < unit_count; i++) + { + int32_t sum = bias_data[i]; + for (int j = 0; j < rank; j++) + { + sum += *ptr_a; + ptr_a++; + } + output_data_temp[i] = sum; + } + } + } + } + else + { + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + q31_t *output_data_temp = buffer_b + i_batch * unit_count; + q31_t *ptr_a = buffer_a + i_batch * feature_batches; + + for (int i = 0; i < unit_count; i++) + { + int32_t sum = 0; + for (int j = 0; j < rank; j++) + { + sum += *ptr_a; + ptr_a++; + } + output_data_temp[i] = sum; + } + } + } + +#if defined(ARM_MATH_MVEI) + int32_t num_elements = input_batches * unit_count; + const int32_t loop_count = (num_elements + 3) / 4; + for (int i_op = 0; i_op < loop_count; i_op++) + { + mve_pred16_t p = vctp32q((uint32_t)num_elements); + int32x4_t op = vldrwq_z_s32(buffer_b, p); + op = arm_requantize_mve(op, multiplier_out, shift_2); + op = vaddq_n_s32(op, zp_out); + const int32x4_t min_vec = vdupq_n_s32((int8_t)out_activation_min); + const int32x4_t max_vec = vdupq_n_s32((int8_t)out_activation_max); + op = vmaxq_s32(op, min_vec); + op = vminq_s32(op, max_vec); + vstrbq_p_s32(output_data, op, p); + output_data += 4; + buffer_b += 4; + num_elements -= 4; + } +#else + for (int i = 0; i < input_batches * unit_count; i++) + { + output_data[i] = (q7_t)CLAMP( + arm_nn_requantize(buffer_b[i], multiplier_out, shift_2) + zp_out, out_activation_max, out_activation_min); + } +#endif + + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of SVDF group + */ diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/CMakeLists.txt new file mode 100644 index 0000000..622b990 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/CMakeLists.txt @@ -0,0 +1,22 @@ +# +# Copyright (c) 2019-2022 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8.c") +target_sources(cmsis-nn PRIVATE ${SRC} arm_softmax_s8_s16.c + arm_softmax_s16.c + arm_nn_softmax_common_s8.c) diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c new file mode 100644 index 0000000..84d1ac8 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_softmax_common_s8.c + * Description: Softmax with s8 input and output of s8 or s16. + * + * $Date: 17 March 2022 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M processors + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +#define ACCUM_BITS 12 + +/** + * @ingroup groupSupport + */ + +/** + * @addtogroup Softmax + * @{ + */ + +/* + * Softmax function with s8 input and output of s8 or s16. + * + * Refer header file for details. + * + */ +void arm_nn_softmax_common_s8(const int8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + const bool int16_output, + void *output) +{ + const int32_t mask = (1 << shift); + + int32_t col = 0; + int32_t row_idx; + + for (row_idx = 0; row_idx < num_rows; ++row_idx) + { + // Find the maximum value in order to ensure numerical stability + int8_t max = *input; + + for (col = 1; col < row_size; ++col) + { + max = MAX(max, input[col]); + } + + int32_t diff = 0; + int32_t sum = 0; + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + if (diff >= diff_min) + { + sum += DIV_POW2(EXP_ON_NEG(MUL_SAT(diff * mask, mult)), ACCUM_BITS); + } + } + + const int32_t headroom = __CLZ(sum); + const int32_t shifted_scale = ONE_OVER1((sum > 0 ? sum << headroom : 0) - (1 << 31)); + int32_t bits_over_unit; + + if (int16_output) + { + int16_t *output_s16 = (int16_t *)output + row_idx * row_size; + + bits_over_unit = ACCUM_BITS - headroom + 15; + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + + if (diff >= diff_min) + { + const int32_t res = + DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit) + + NN_Q15_MIN; + output_s16[col] = (int16_t)CLAMP(res, (int32_t)NN_Q15_MAX, (int32_t)NN_Q15_MIN); + } + else + { + output_s16[col] = NN_Q15_MIN; + } + } + } + else + { + int8_t *output_s8 = (int8_t *)output + row_idx * row_size; + + bits_over_unit = ACCUM_BITS - headroom + 23; + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + if (diff >= diff_min) + { + const int32_t res = + DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit) + + NN_Q7_MIN; + output_s8[col] = (int8_t)CLAMP(res, (int32_t)NN_Q7_MAX, (int32_t)NN_Q7_MIN); + } + else + { + output_s8[col] = NN_Q7_MIN; + } + } + } + + input += row_size; + } +} + +/** + * @} end of NNBasicMath group + */ diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q15.c b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q15.c new file mode 100644 index 0000000..18f3e83 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q15.c @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_q15.c + * Description: Q15 softmax function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +/** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * + * @details + * + * Here, instead of typical e based softmax, we use + * 2-based softmax, i.e.,: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different here. + * But mathematically, the gradient will be the same + * with a log(2) scaling factor. + * + */ + +void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out) +{ + q31_t sum; + int16_t i; + uint8_t shift; + q31_t base; + base = -1 * 0x100000; + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + base = vec_in[i]; + } + } + + /* we ignore really small values + * anyway, they will be 0 after shrinking + * to q15_t + */ + base = base - 16; + + sum = 0; + + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + shift = (uint8_t)__USAT(vec_in[i] - base, 5); + sum += 0x1 << shift; + } + } + + /* This is effectively (0x1 << 32) / sum */ + int64_t div_base = 0x100000000LL; + int output_base = (int32_t)(div_base / sum); + + /* Final confidence will be output_base >> ( 17 - (vec_in[i] - base) ) + * so 32768 (0x1<<15) -> 100% confidence when sum = 0x1 << 16, output_base = 0x1 << 16 + * and vec_in[i]-base = 16 + */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + /* Here minimum value of 17+base-vec[i] will be 1 */ + shift = (uint8_t)__USAT(17 + base - vec_in[i], 5); + p_out[i] = (q15_t)__SSAT((output_base >> shift), 16); + } + else + { + p_out[i] = 0; + } + } +} + +/** + * @} end of Softmax group + */ diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q7.c b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q7.c new file mode 100644 index 0000000..58eb990 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_q7.c @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_q7.c + * Description: Q7 softmax function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +/** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * + * @details + * + * Here, instead of typical natural logarithm e based softmax, we use + * 2-based softmax here, i.e.,: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different here. + * But mathematically, the gradient will be the same + * with a log(2) scaling factor. + * + */ + +void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out) +{ + q31_t sum; + int16_t i; + uint8_t shift; + q15_t base; + base = -128; + + /* We first search for the maximum */ + for (i = 0; i < dim_vec; i++) + { + if (vec_in[i] > base) + { + base = vec_in[i]; + } + } + + /* + * So the base is set to max-8, meaning + * that we ignore really small values. + * anyway, they will be 0 after shrinking to q7_t. + */ + base = base - (1 << 3); + + sum = 0; + + for (i = 0; i < dim_vec; i++) + { + shift = (uint8_t)__USAT(vec_in[i] - base, 3); + sum += 0x1 << shift; + } + + /* This is effectively (0x1 << 20) / sum */ + int output_base = (1 << 20) / sum; + + for (i = 0; i < dim_vec; i++) + { + + /* Here minimum value of 13+base-vec_in[i] will be 5 */ + shift = (uint8_t)__USAT(13 + base - vec_in[i], 5); + p_out[i] = (q7_t)__SSAT((output_base >> shift), 8); + } +} + +/** + * @} end of Softmax group + */ diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s16.c b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s16.c new file mode 100644 index 0000000..e840893 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s16.c @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_s16.c + * Description: S16 softmax function + * + * $Date: 9 March 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @addtogroup Softmax + * @{ + */ + +arm_status arm_softmax_s16(const int16_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const cmsis_nn_softmax_lut_s16 *softmax_params, + int16_t *output) +{ + int32_t col = 0; + int32_t row_idx; + + if (softmax_params->exp_lut == NULL || softmax_params->one_by_one_lut == NULL) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + for (row_idx = 0; row_idx < num_rows; ++row_idx) + { + // Find the maximum value in order to ensure numerical stability + int16_t max = *input; + for (col = 1; col < row_size; ++col) + { + max = MAX(max, input[col]); + } + + int32_t diff = 0; + int32_t sum = 0; + int16_t *cached_exp_results = output; + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + const int32_t scaled_diff = arm_nn_requantize(diff, mult, shift); + const int32_t symmetric_scaled_diff = scaled_diff + NN_Q15_MAX; + const int16_t saturated_symmetric_scaled_diff = MIN(MAX(symmetric_scaled_diff, NN_Q15_MIN), NN_Q15_MAX); + + // Lookup from exp table and cache result for next step + const int16_t index = (256 + (saturated_symmetric_scaled_diff >> 7)); + const int16_t offset = saturated_symmetric_scaled_diff & 0x7f; + const int16_t base = softmax_params->exp_lut[index]; + const int16_t slope = softmax_params->exp_lut[index + 1] - softmax_params->exp_lut[index]; + const int16_t delta = (slope * offset + 64) >> 7; + const int16_t result = (base + delta); + cached_exp_results[col] = result; + + sum += cached_exp_results[col]; + } + + const int32_t headroom = __CLZ(sum); + + // Compute the reciprocal 1/sum + const int32_t shifted_sum = (((sum) << (headroom - 1)) + (1 << 13)) >> 14; + + // Since LUT computes 1/(1 + x), compute x = (sum - 1) => -65536 + // Since LUT expects a symmetrical input, recenter from [UINT16_MIN, UINT16_MAX] to [INT16_MIN, INT16_MAX] => + // -32768 ==> So in total -65536 -32768 => -98304 + const int16_t symmetric_shifted_sum = shifted_sum - 98304; + + // Lookup from one by one table + const int16_t index = (256 + (symmetric_shifted_sum >> 7)); + const int16_t offset = symmetric_shifted_sum & 0x7f; + const int16_t base = softmax_params->one_by_one_lut[index]; + const int16_t slope = softmax_params->one_by_one_lut[index + 1] - softmax_params->one_by_one_lut[index]; + const int16_t delta = (slope * offset + 64) >> 7; + const int16_t one_by_one_result = (base + delta); + + for (col = 0; col < row_size; ++col) + { + const int16_t right_shift = 30 - headroom; + int32_t result = (cached_exp_results[col] * one_by_one_result) >> right_shift; + result = (result + 1) >> 1; // Last shift position and insert round + output[col] = (int16_t)result; + } + + output += row_size; + input += row_size; + } + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of Softmax group + */ diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8.c b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8.c new file mode 100644 index 0000000..fd70597 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8.c @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_s8.c + * Description: S8 softmax function + * + * $Date: 9 March 2022 + * $Revision: V.2.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#define ACCUM_BITS 12 + +#ifdef ARM_MATH_MVEI +static int32x4_t arm_exp_on_negative_values_mve_32x4(int32x4_t val) +{ +#define SHIFT_START (24) + int32_t shift = SHIFT_START; + int32x4_t mask; + + const int32x4_t val_mod_minus_quarter = + vandq_s32(val, vdupq_n_s32((1 << SHIFT_START) - 1)) - vdupq_n_s32(1 << SHIFT_START); + const int32x4_t remainder = vsubq_s32(val_mod_minus_quarter, val); + const int32x4_t x = vaddq_n_s32(val_mod_minus_quarter << 5, 1 << 28); + const int32x4_t x2 = MUL_SAT_MVE(x, x); + const int32x4_t op_1 = DIV_POW2_MVE(MUL_SAT_MVE(x2, x2), 2) + MUL_SAT_MVE(x2, x); + const int32x4_t op_2 = x + DIV_POW2_MVE(MUL_SAT_MVE(op_1, vdupq_n_s32(715827883)) + x2, 1); + int32x4_t result = vdupq_n_s32(1895147668) + MUL_SAT_MVE(vdupq_n_s32(1895147668), op_2); + +#define SELECT_IF_NON_ZERO(x) \ + { \ + mve_pred16_t p = vcmpneq_n_s32(remainder & vdupq_n_s32(1 << shift++), 0); \ + mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); \ + result = SELECT_USING_MASK(mask, MUL_SAT_MVE(result, vdupq_n_s32(x)), result); \ + } + + SELECT_IF_NON_ZERO(1672461947) + SELECT_IF_NON_ZERO(1302514674) + SELECT_IF_NON_ZERO(790015084) + SELECT_IF_NON_ZERO(290630308) + SELECT_IF_NON_ZERO(39332535) + SELECT_IF_NON_ZERO(720401) + SELECT_IF_NON_ZERO(242) + +#undef SELECT_IF_NON_ZERO + + mve_pred16_t p = vcmpeqq_n_s32(val, 0); + mask = vmvnq_m_s32(vdupq_n_s32(0), vdupq_n_s32(0), p); + + result = SELECT_USING_MASK(mask, vdupq_n_s32(NN_Q31_MAX), result); + return result; +} +#endif + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +void arm_softmax_s8(const int8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int8_t *output) +{ +#ifdef ARM_MATH_MVEI + +#define ACT_MIN ((int8_t)NN_Q7_MIN) +#define ACT_MAX ((int8_t)NN_Q7_MAX) + + const int32_t mask = (1 << shift); + + for (int i_num_rows = 0; i_num_rows < num_rows; ++i_num_rows) + { + int8_t max = ACT_MIN; + + int32_t vec_count = (row_size + 15) / 16; + uint32_t r_count = (uint32_t)row_size; + for (int i = 0; i < vec_count; i++) + { + mve_pred16_t p = vctp8q(r_count); + const int8x16_t ip = vldrbq_z_s8(&input[i * 16], p); + max = vmaxvq_p_s8(max, ip, p); + r_count -= 16; + } + + vec_count = row_size / 4; + int32_t idx = 0; + int32_t sum = 0; + + while (vec_count) + { + int32x4_t ip = vldrbq_s32(&input[idx * 4]); + ip = vsubq_n_s32(ip, max); + mve_pred16_t p = vcmpgeq_n_s32(ip, diff_min); + if (p != 0) + { + ip = vmulq_n_s32(ip, mask); + + int32x4_t res = MUL_SAT_MVE(ip, vdupq_n_s32(mult)); + + res = arm_exp_on_negative_values_mve_32x4(res); + res = DIV_POW2_MVE(res, ACCUM_BITS); + res = vpselq_s32(res, vdupq_n_s32(0), p); + sum += vaddvq_s32(res); + } + + vec_count--; + idx++; + } + + const int32_t tail_idx = row_size & ~3; + for (int i = 0; i < (row_size & 3); i++) + { + const int32_t diff = input[tail_idx + i] - max; + if (diff >= diff_min) + { + sum += DIV_POW2(EXP_ON_NEG(MUL_SAT(diff * mask, mult)), ACCUM_BITS); + } + } + + const int32_t headroom = __CLZ((uint32_t)sum); + const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; + const int32_t shifted_scale = ONE_OVER1((sum > 0 ? sum << headroom : 0) - (1 << 31)); + + vec_count = row_size / 4; + idx = 0; + + while (vec_count) + { + int32x4_t ip = vldrbq_s32(&input[idx]); + ip = vsubq_n_s32(ip, max); + + mve_pred16_t p = vcmpgeq_n_s32(ip, diff_min); + + int32x4_t tmp_res; + + if (p != 0) + { + ip = vmulq_n_s32(ip, mask); + + tmp_res = MUL_SAT_MVE(ip, vdupq_n_s32(mult)); + tmp_res = arm_exp_on_negative_values_mve_32x4(tmp_res); + tmp_res = MUL_SAT_MVE(vdupq_n_s32(shifted_scale), tmp_res); + tmp_res = DIV_POW2_MVE(tmp_res, bits_over_unit); + tmp_res += vdupq_n_s32(ACT_MIN); + + tmp_res = vmaxq_s32(tmp_res, vdupq_n_s32(ACT_MIN)); + tmp_res = vminq_s32(tmp_res, vdupq_n_s32(ACT_MAX)); + tmp_res = vpselq_s32(tmp_res, vdupq_n_s32(ACT_MIN), p); + } + else + { + tmp_res = vdupq_n_s32(ACT_MIN); + } + vstrbq_s32(&output[idx], tmp_res); + vec_count--; + idx += 4; + } + + for (int i = 0; i < (row_size & 3); i++) + { + int32_t diff = input[tail_idx + i] - max; + if (diff >= diff_min) + { + const int32_t res = + DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit) + + NN_Q7_MIN; + output[tail_idx + i] = (int8_t)CLAMP(res, (int32_t)ACT_MAX, (int32_t)ACT_MIN); + } + else + { + output[tail_idx + i] = ACT_MIN; + } + } + + input += row_size; + output += row_size; + } +#else + arm_nn_softmax_common_s8(input, num_rows, row_size, mult, shift, diff_min, false, (void *)output); +#endif +} + +/** + * @} end of Softmax group + */ diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8_s16.c b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8_s16.c new file mode 100644 index 0000000..9ba0b9a --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8_s16.c @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_s8_s16.c + * Description: S8 to s16 softmax function + * + * $Date: 7 January 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +void arm_softmax_s8_s16(const int8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + int16_t *output) +{ + arm_nn_softmax_common_s8(input, num_rows, row_size, mult, shift, diff_min, true, (void *)output); +} +/** + * @} end of Softmax group + */ diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_u8.c b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_u8.c new file mode 100644 index 0000000..c4df8f8 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_u8.c @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_u8.c + * Description: U8 softmax function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#define ACCUM_BITS 12 + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ +void arm_softmax_u8(const uint8_t *input, + const int32_t num_rows, + const int32_t row_size, + const int32_t mult, + const int32_t shift, + const int32_t diff_min, + uint8_t *output) +{ + const int32_t mask = (1 << shift); + + int32_t col = 0; + int32_t row_idx; + + for (row_idx = 0; row_idx < num_rows; ++row_idx) + { + // Find the maximum value in order to ensure numerical stability + uint8_t max = *input; + + for (col = 1; col < row_size; ++col) + { + max = MAX(max, input[col]); + } + + int32_t diff = 0; + int32_t sum = 0; + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + if (diff >= diff_min) + { + sum += DIV_POW2(EXP_ON_NEG(MUL_SAT(diff * mask, mult)), ACCUM_BITS); + } + } + + const int32_t headroom = __CLZ((uint32_t)sum); + const int32_t bits_over_unit = ACCUM_BITS - headroom + 23; + const int32_t shifted_scale = ONE_OVER1((sum << headroom) - (1 << 31)); + + for (col = 0; col < row_size; ++col) + { + diff = input[col] - max; + if (diff >= diff_min) + { + const int32_t res = + DIV_POW2(MUL_SAT(shifted_scale, EXP_ON_NEG(MUL_SAT(diff * mask, mult))), bits_over_unit); + output[col] = (uint8_t)CLAMP(res, (int32_t)255, (int32_t)0); + } + else + { + output[col] = 0; + } + } + input += row_size; + output += row_size; + } +} +/** + * @} end of Softmax group + */
\ No newline at end of file diff --git a/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_with_batch_q7.c b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_with_batch_q7.c new file mode 100644 index 0000000..66e892e --- /dev/null +++ b/Drivers/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_with_batch_q7.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_softmax_with_batch_q7.c + * Description: Q7 softmax function + * + * $Date: 09. October 2020 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M and Cortex-A cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Softmax + * @{ + */ + +/** + * @brief Q7 softmax function with batch parameter + * @param[in] vec_in pointer to input vector + * @param[in] nb_batches number of batches + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * + * @details + * + * Here, instead of typical natural logarithm e based softmax, we use + * 2-based softmax here, i.e.,: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different here. + * But mathematically, the gradient will be the same + * with a log(2) scaling factor. + * + */ + +void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out) +{ + for (int i = 0; i < nb_batches; i++) + { + arm_softmax_q7(vec_in, dim_vec, p_out); + vec_in += dim_vec; + p_out += dim_vec; + } +} + +/** + * @} end of Softmax group + */ |