Diffstat (limited to 'Drivers/CMSIS/NN/Source/ConvolutionFunctions')
31 files changed, 7623 insertions, 0 deletions
diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt new file mode 100644 index 0000000..30be0fe --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2019-2022 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8*.c") +file(GLOB SRC_S16 "./*_s16*.c") +target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16}) + + + diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c new file mode 100644 index 0000000..a3edd40 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1_x_n_s8.c + * Description: s8 version of 1xN convolution using symmetric quantization. + * + * $Date: December 14, 2021 + * $Revision: V.2.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * 1xN s8 convolution function. + * + * Refer header file for details. 
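+ *
+ * Illustrative call sketch (buffer and argument names are placeholders,
+ * not part of this file); note the function rejects output widths that
+ * are not a multiple of 4 with ARM_MATH_SIZE_MISMATCH:
+ * \code
+ *   const int32_t buf_size = arm_convolve_1_x_n_s8_get_buffer_size(&input_dims, &filter_dims);
+ *   cmsis_nn_context ctx;
+ *   ctx.buf = scratch_buf; // caller-allocated, at least buf_size bytes
+ *   ctx.size = buf_size;
+ *   arm_status status = arm_convolve_1_x_n_s8(&ctx, &conv_params, &quant_params,
+ *                                             &input_dims, input_data, &filter_dims,
+ *                                             filter_data, &bias_dims, bias_data,
+ *                                             &output_dims, output_data);
+ * \endcode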
+ * + */ + +arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + (void)bias_dims; + arm_status status = ARM_MATH_SUCCESS; + if (output_dims->w % 4 != 0) + { + status = ARM_MATH_SIZE_MISMATCH; + goto out; + } + +#if defined(ARM_MATH_MVEI) + (void)ctx; + + const uint16_t input_x = input_dims->w; + const uint16_t kernel_x = filter_dims->w; + const uint16_t output_x = output_dims->w; + const uint16_t output_ch = output_dims->c; + const uint16_t input_ch = input_dims->c; + const uint16_t pad_x = conv_params->padding.w; + const uint16_t stride_x = conv_params->stride.w; + + const int32_t input_offset = conv_params->input_offset; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4) + { + int32_t input_begin_idx[4]; + int32_t ker_begin_idx[4]; + int32_t ker_end_idx[4]; + + for (int i = 0; i < 4; i++) + { + const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x; + input_begin_idx[i] = MAX(0, est_input_x_idx); + ker_begin_idx[i] = MAX(0, -est_input_x_idx); + ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx); + } + + if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x)) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32x4_t s_offset; + int32_t acc[4]; + { + int32_t sum_row[4]; + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch, + input_data + input_begin_idx[0] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[0] * input_ch), + &sum_row[0], + &acc[0]); + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch, + input_data + input_begin_idx[1] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[1] * input_ch), + &sum_row[1], + &acc[1]); + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch, + input_data + input_begin_idx[2] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[2] * input_ch), + &sum_row[2], + &acc[2]); + + (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch, + input_data + input_begin_idx[3] * input_ch, + filter_data + (input_ch * kernel_x * i_out_ch) + + (ker_begin_idx[3] * input_ch), + &sum_row[3], + &acc[3]); + + s_offset = vldrwq_s32(sum_row); + } + int32x4_t res = vldrwq_s32(acc); + s_offset = vmulq_n_s32(s_offset, input_offset); + res = vaddq_s32(res, s_offset); + if (bias_data) + { + res = vaddq_n_s32(res, bias_data[i_out_ch]); + } + res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]); + res = vaddq_n_s32(res, out_offset); + + res = vmaxq_s32(res, vdupq_n_s32(out_activation_min)); + res = vminq_s32(res, vdupq_n_s32(out_activation_max)); + + const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3}; + vstrbq_scatter_offset_s32(output_data, scatter_offset, res); + output_data++; + } + output_data += (3 * output_ch); + } + else + { + output_data = 
arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch, + stride_x * input_ch, + input_data + input_begin_idx[0] * input_ch, + filter_data, + output_ch, + conv_params, + quant_params, + bias_data, + output_data); + } + } + +#else + status = arm_convolve_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); +#endif + +out: + /* Return to application */ + return status; +} + +int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if !defined(ARM_MATH_MVEI) + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c new file mode 100644 index 0000000..3db3ba4 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c @@ -0,0 +1,235 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_convolve_1x1_HWC_q7_fast_nonsquare.c
+ * Description: Fast Q7 version of 1x1 convolution (non-square shape)
+ *
+ * $Date: July 20, 2021
+ * $Revision: V.1.1.2
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/**
+ * @brief Fast Q7 version of 1x1 convolution (non-square shape)
+ * @param[in] Im_in pointer to input tensor
+ * @param[in] dim_im_in_x input tensor dimension x
+ * @param[in] dim_im_in_y input tensor dimension y
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] wt pointer to kernel weights
+ * @param[in] ch_im_out number of filters, i.e., output tensor channels
+ * @param[in] dim_kernel_x filter kernel size x
+ * @param[in] dim_kernel_y filter kernel size y
+ * @param[in] padding_x padding size x
+ * @param[in] padding_y padding size y
+ * @param[in] stride_x convolution stride x
+ * @param[in] stride_y convolution stride y
+ * @param[in] bias pointer to bias
+ * @param[in] bias_shift amount of left-shift for bias
+ * @param[in] out_shift amount of right-shift for output
+ * @param[in,out] Im_out pointer to output tensor
+ * @param[in] dim_im_out_x output tensor dimension x
+ * @param[in] dim_im_out_y output tensor dimension y
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferB pointer to buffer space for output
+ * @return The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ *
+ * This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1
+ * and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise
+ * separable convolution.
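+ *
+ * With a 1x1 kernel, unit stride and zero padding, each output pixel is a
+ * plain matrix-vector product over the input channels. A sketch of the
+ * per-pixel reference arithmetic (the optimized path below computes the
+ * same result via a reordered q15 GEMM):
+ * \code
+ *   // for each pixel p and each output channel i:
+ *   //   sum  = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
+ *   //   for (l = 0; l < ch_im_in; l++)
+ *   //       sum += Im_in[p * ch_im_in + l] * wt[i * ch_im_in + l];
+ *   //   Im_out[...] = (q7_t)__SSAT(sum >> out_shift, 8);
+ * \endcode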
+ * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + * + * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications + * https://arxiv.org/abs/1704.04861 + */ + +arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + (void)dim_im_in_y; + int16_t i_out_y, i_out_x; + int16_t i_ch_out; + + /* ----------------------- + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 || + padding_y != 0 || stride_x != 1 || stride_y != 1) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_out_y * dim_im_in_x + i_out_x) * ch_im_in, pBuffer, ch_im_in); + pBuffer += ch_im_in; + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* check if there is left-over for compute */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) + { + q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = bufferA; + /* basically each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad_reordered(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q7_t)__SSAT((sum >> out_shift), 8); + pOut++; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 || + padding_y != 0 || stride_x != 1 || stride_y != 1) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 
0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_y + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c new file mode 100644 index 0000000..6183f55 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_1x1_s8_fast.c + * Description: Fast q7 version of 1x1 convolution (non-square shape) + * + * $Date: 12. November 2021 + * $Revision: V.2.0.4 + * + * Target Processor: Cortex-M Processors + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" +#include <stdio.h> + +#define DIM_KER_X (1U) +#define DIM_KER_Y (1U) + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Fast s8 version for 1x1 convolution (non-square shape) + * + * Refer header file for details. 
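+ *
+ * Illustrative call sketch (argument objects are placeholders). The fast
+ * path needs input_dims->c to be a multiple of 4 with zero padding and
+ * unit stride, otherwise ARM_MATH_SIZE_MISMATCH is returned:
+ * \code
+ *   cmsis_nn_context ctx = {NULL, 0}; // no scratch needed: the get_buffer_size() below returns 0
+ *   arm_status status = arm_convolve_1x1_s8_fast(&ctx, &conv_params, &quant_params,
+ *                                                &input_dims, input_data, &filter_dims,
+ *                                                filter_data, &bias_dims, bias_data,
+ *                                                &output_dims, output_data);
+ * \endcode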
+ * + */ + +arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + if (input_dims->c % 4 != 0 || conv_params->padding.w != 0 || conv_params->padding.h != 0 || + conv_params->stride.w != 1 || conv_params->stride.h != 1) + { + return ARM_MATH_SIZE_MISMATCH; + } + + (void)ctx; + (void)filter_dims; + (void)bias_dims; + +#if defined(ARM_MATH_MVEI) + + const int32_t col_len = input_dims->w * input_dims->h * input_dims->n; + const int32_t output_ch = output_dims->c; + const int32_t input_ch = input_dims->c; + const int32_t input_offset = conv_params->input_offset; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_items = 0; i_items <= (col_len - 4); i_items += 4) + { + + output_data = arm_nn_mat_mul_core_4x_s8(input_ch, + input_ch, + input_data + i_items * input_ch, + filter_data, + output_ch, + conv_params, + quant_params, + bias_data, + output_data); + } + + /* Handle left over elements */ + for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t sum_row = 0; + int32_t acc; + (void)arm_nn_mat_mul_core_1x_s8( + input_ch, input_data + i_items * input_ch, filter_data + i_out_ch * input_ch, &sum_row, &acc); + if (bias_data) + { + acc += bias_data[i_out_ch]; + } + sum_row = (sum_row * input_offset); + acc += sum_row; + acc = arm_nn_requantize(acc, output_mult[i_out_ch], output_shift[i_out_ch]); + acc += out_offset; + + acc = MAX(acc, out_activation_min); + acc = MIN(acc, out_activation_max); + *output_data++ = acc; + } + } + +#else + /* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */ + + const int32_t lhs_rows = input_dims->w * input_dims->h * input_dims->n; + const int32_t rhs_rows = output_dims->c; + const int32_t rhs_cols = input_dims->c; + + arm_nn_mat_mult_nt_t_s8(input_data, + filter_data, + bias_data, + output_data, + quant_params->multiplier, + quant_params->shift, + lhs_rows, + rhs_rows, + rhs_cols, + conv_params->input_offset, + conv_params->output_offset, + conv_params->activation.min, + conv_params->activation.max); + +#endif + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims) +{ + (void)input_dims; + return 0; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c new file mode 100644 index 0000000..0a6868a --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_basic.c + * Description: Q15 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * This basic version is designed to work for any input tensor and weight + * dimension. 
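+ *
+ * Buffer allocation sketch (macro names are illustrative; the size follows
+ * the figure above, bufferB is unused):
+ * \code
+ *   q15_t bufferA[CH_IM_IN * DIM_KERNEL * DIM_KERNEL]; // im2col scratch
+ *   arm_status status = arm_convolve_HWC_q15_basic(Im_in, dim_im_in, ch_im_in, wt,
+ *                                                  ch_im_out, dim_kernel, padding, stride,
+ *                                                  bias, bias_shift, out_shift,
+ *                                                  Im_out, dim_im_out, bufferA, NULL);
+ * \endcode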
+ */ + +arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + uint16_t im2col_out_pixel_index = 0; + q15_t *pBuffer = bufferA; + q15_t *pOut = Im_out; + q15_t *im_buffer = bufferA; + const q15_t *pA; + int i; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + pA = wt; + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = im_buffer; + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA1, inB1, sum); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q15_t)__SSAT((sum >> out_shift), 16); + pOut++; + } + + /* counter reset */ + pBuffer = im_buffer; + im2col_out_pixel_index++; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c 
b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c new file mode 100644 index 0000000..66fbc00 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast.c @@ -0,0 +1,259 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_fast.c + * Description: Fast Q15 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
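+ *
+ * Implementation note (derived from the code below): the DSP path buffers two
+ * im2col columns and computes a 2x2 block per inner iteration, i.e. two output
+ * channels for two adjacent output pixels at once (sum .. sum4), which is why
+ * ch_im_in, ch_im_out and dim_im_out must all be even.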
+ * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multiple of 2 + * + * dim_im_out is a multiple of 2 + * + */ + +arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + q15_t *pBuffer = bufferA; + q15_t *im_buffer = bufferA; + q15_t *pOut = Im_out; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0 || dim_im_out & 0x1) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (i_out_x & 0x1) + { + int i; + /* initialize the matrix pointers for A */ + const q15_t *pA = wt; + + /* set up the second output pointers */ + q15_t *pOut2 = pOut + ch_im_out; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = im_buffer; + const q15_t *pB2 = pB + ch_im_in * dim_kernel * dim_kernel; + + /* aling the second pointer for A */ + const q15_t *pA2 = pA + ch_im_in * dim_kernel * dim_kernel; + + /* init the sum with bias */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 1; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA2); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA1, inB1, sum); + sum2 = __SMLAD(inA1, inB2, sum2); + sum3 = __SMLAD(inA2, inB1, sum3); + sum4 = __SMLAD(inA2, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x1; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q15_t)__SSAT(sum 
>> out_shift, 16); + *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16); + + /* skip the row computed with A2 */ + pA += ch_im_in * dim_kernel * dim_kernel; + } /* for over ch_im_out */ + + pOut += ch_im_out; + /* counter reset */ + pBuffer = im_buffer; + } + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c new file mode 100644 index 0000000..7babe51 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c @@ -0,0 +1,270 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q15_fast.c + * Description: Fast Q15 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Fast Q15 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multiple of 2 + * + */ + +arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q15_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q15_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q15_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + q15_t *pBuffer = bufferA; + q15_t *im_buffer = bufferA; + q15_t *pOut = Im_out; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 
0, sizeof(q15_t) * ch_im_in); + } + else + { + /* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, + (q15_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, + sizeof(q15_t) * ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (i_out_x & 0x1) + { + int i; + /* initialize the matrix pointers for A */ + const q15_t *pA = wt; + + /* set up the second output pointers */ + q15_t *pOut2 = pOut + ch_im_out; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = im_buffer; + const q15_t *pB2 = pB + ch_im_in * dim_kernel_y * dim_kernel_x; + + /* aling the second pointer for A */ + const q15_t *pA2 = pA + ch_im_in * dim_kernel_y * dim_kernel_x; + + /* init the sum with bias */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 1; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA1 = arm_nn_read_q15x2_ia(&pA); + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inA2 = arm_nn_read_q15x2_ia(&pA2); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA1, inB1, sum); + sum2 = __SMLAD(inA1, inB2, sum2); + sum3 = __SMLAD(inA2, inB1, sum3); + sum4 = __SMLAD(inA2, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x1; + while (colCnt) + { + q15_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q15_t)__SSAT(sum >> out_shift, 16); + *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16); + *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16); + + /* skip the row computed with A2 */ + pA += ch_im_in * dim_kernel_y * dim_kernel_x; + } /* for over ch_im_out */ + + pOut += ch_im_out; + /* counter reset */ + pBuffer = im_buffer; + } + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git 
a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c new file mode 100644 index 0000000..618f492 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_RGB.c + * Description: Q7 version of convolution for RGB image + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 convolution function for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in equals 3 + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. This applies on the first layer of CNNs which has input + * image with RGB format. 
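+ *
+ * Illustrative first-layer call (names are placeholders; ch_im_in must be 3,
+ * otherwise ARM_MATH_SIZE_MISMATCH is returned):
+ * \code
+ *   q15_t bufferA[2 * 3 * DIM_KERNEL * DIM_KERNEL]; // per the buffer size figure above
+ *   arm_status status = arm_convolve_HWC_q7_RGB(rgb_input, dim_im_in, 3, wt,
+ *                                               ch_im_out, dim_kernel, padding, stride,
+ *                                               bias, bias_shift, out_shift,
+ *                                               Im_out, dim_im_out, bufferA, NULL);
+ * \endcode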
+ */ + +arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + // check if number of input channels is 3 + if (ch_im_in != 3) + { + return ARM_MATH_SIZE_MISMATCH; + } + // This part implements the im2col function + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */ + arm_memset_q7((q7_t *)pBuffer, (q7_t)0, 3 * sizeof(q15_t)); + pBuffer += 3; + } + else + { + /* + * Equivalent to: + * arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3); + */ + + const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3; + q31_t buf = arm_nn_read_q7x4(pPixel); + + union arm_nnword top; + union arm_nnword bottom; + + top.word = __SXTB16(buf); + bottom.word = __SXTB16(__ROR(buf, 8)); + +#ifndef ARM_MATH_BIG_ENDIAN + /* + * little-endian, | omit | 3rd | 2nd | 1st | + * MSB LSB + * top | 3rd | 1st |; bottom | omit | 2nd | + * + * version 1, need to swap 2nd and 3rd weight + * *__SIMD32(pBuffer) = top.word; + * *(pBuffer+2) = bottom.half_words[0]; + * + * version 2, no weight shuffling required + */ + *pBuffer++ = top.half_words[0]; + int32_t packed_word = __PKHBT(bottom.word, top.word, 0); + arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4); +#else + /* + * big-endian, | 1st | 2nd | 3rd | omit | + * MSB LSB + * top | 2nd | omit |; bottom | 1st | 3rd | + * + * version 1, need to swap 2nd and 3rd weight + * *__SIMD32(pBuffer) = bottom.word; + * *(pBuffer+2) = top.half_words[1]; + * + * version 2, no weight shuffling required + */ + *pBuffer++ = bottom.half_words[0]; + int32_t packed_word = __PKHTB(top.word, bottom.word, 0); + arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4); +#endif + pBuffer += 2; + } + } + } + + if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, 3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + q15_t *pB = bufferA; + /* basically each time it process 4 entries */ + uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, 
&inA2); + + inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pB); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = 3 * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + // check if number of input channels is 3 + if (ch_im_in != 3) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + /* if-for implementation */ + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return (ARM_MATH_SUCCESS); +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c new file mode 100644 index 0000000..e274413 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_basic.c + * Description: Q7 version of convolution + * + * $Date: 20. 
July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * This basic version is designed to work for any input tensor and weight + * dimension. + */ + +arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* Copying the pixel data to column */ + arm_q7_to_q15_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* Computation is filed for every 2 columns */ + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = ((q31_t)bias[i] << 
bias_shift) + NN_ROUND(out_shift); + + /* Point to the beging of the im2col buffer */ + const q15_t *pB = bufferA; + + /* Each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + (void)bufferA; + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + // if-for implementation + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c new file mode 100644 index 0000000..b42a57d --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_HWC_q7_basic.c + * Description: Q7 version of convolution + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Basic Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns <code>ARM_MATH_SUCCESS</code> + */ + +arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* Filling 0 for out-of-bound paddings */ + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + /* Copying the pixel data to column */ + arm_q7_to_q15_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* Computation is filed for every 2 columns */ + if (pBuffer == bufferA + 2 * 
ch_im_in * dim_kernel_y * dim_kernel_x) + { + pOut = arm_nn_mat_mult_kernel_q7_q15( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_y * dim_kernel_x, bias_shift, out_shift, bias, pOut); + + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* left-over because odd number of output pixels */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + + /* Point to the beging of the im2col buffer */ + const q15_t *pB = bufferA; + + /* Each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 2; + + while (colCnt) + { + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + } + } +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + (void)bufferA; + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + // if-for implementation + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c new file mode 100644 index 0000000..51d98fd --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c @@ -0,0 +1,380 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_convolve_HWC_q7_fast.c
+ * Description:  Fast Q7 version of convolution
+ *
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/**
+ * @brief Fast Q7 convolution function
+ * @param[in]     Im_in      pointer to input tensor
+ * @param[in]     dim_im_in  input tensor dimension
+ * @param[in]     ch_im_in   number of input tensor channels
+ * @param[in]     wt         pointer to kernel weights
+ * @param[in]     ch_im_out  number of filters, i.e., output tensor channels
+ * @param[in]     dim_kernel filter kernel size
+ * @param[in]     padding    padding size
+ * @param[in]     stride     convolution stride
+ * @param[in]     bias       pointer to bias
+ * @param[in]     bias_shift amount of left-shift for bias
+ * @param[in]     out_shift  amount of right-shift for output
+ * @param[in,out] Im_out     pointer to output tensor
+ * @param[in]     dim_im_out output tensor dimension
+ * @param[in,out] bufferA    pointer to buffer space for input
+ * @param[in,out] bufferB    pointer to buffer space for output
+ * @return        The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ *
+ * @details
+ *
+ * <b>Buffer size:</b>
+ *
+ * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+ *
+ * bufferB size: 0
+ *
+ * <b>Input dimension constraints:</b>
+ *
+ * ch_im_in must be a multiple of 4 (because of the SIMD32 read and swap)
+ *
+ * ch_im_out must be a multiple of 2 (because of the 2x2 mat_mult kernel)
+ *
+ * The im2col converts the Q7 tensor input into a Q15 column, which is stored in
+ * bufferA. Reordering happens during this im2col process via
+ * arm_q7_to_q15_reordered_no_shift: in every group of four elements, the second and
+ * third elements are swapped.
+ *
+ * The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered performs the
+ * GEMM computation with the reordered columns.
+ *
+ * To speed up the determination of the padding condition, we split the
+ * computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}.
+ * This reduces the total number of boundary condition checks and improves
+ * the data copying performance.
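+ *
+ * As a hedged illustration only (the tensor pointers and the 32x32 image size below
+ * are assumptions made up for this sketch, not values required by the function), a
+ * call that satisfies the constraints above could look like:
+ *
+ * <pre>
+ *     // ch_im_in = 8 (multiple of 4), ch_im_out = 16 (multiple of 2), 3x3 kernel
+ *     static q15_t buffer_a[2 * 8 * 3 * 3]; // bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
+ *     arm_status s = arm_convolve_HWC_q7_fast(im_in, 32, 8, wt, 16, 3,
+ *                                             1, 1, bias, 0, 9, im_out, 32, buffer_a, NULL);
+ *     // s is ARM_MATH_SIZE_MISMATCH when the channel constraints are not met
+ * </pre>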
+ */ + +arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* + * Here we split the entire matrix into three regions depending on the padding situation + * Top: i_out_y from 0 to padding - 1 + * Middle: i_out_y from padding to dim_im_out-padding-1 + * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 + */ + + /* top part */ + for (i_out_y = 0; i_out_y < padding; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* middle part, here we also divide the x into left, mid and right */ + for (; i_out_y < dim_im_out - padding; i_out_y++) + { + + /* left part */ + for (i_out_x = 0; i_out_x < padding; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* mid part */ + for (; i_out_x < dim_im_out - padding; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + arm_q7_to_q15_reordered_no_shift((q7_t 
*)Im_in + + (i_ker_y * dim_im_in + i_out_x * stride - padding) * ch_im_in, + pBuffer, + ch_im_in * dim_kernel); + pBuffer += ch_im_in * dim_kernel; + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* right part */ + for (; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + for (; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* check if there is left-over for compute */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = bufferA; + /* each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad_reordered(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA2, inB2, sum); + + colCnt--; + } + colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q7_t)__SSAT((sum >> out_shift), 8); + pOut++; + } + } +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + 
for (j = 0; j < dim_im_out; j++) + { + for (k = 0; k < dim_im_out; k++) + { + conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) + { + for (n = 0; n < dim_kernel; n++) + { + // if-for implementation + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c new file mode 100644 index 0000000..25f17bb --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c @@ -0,0 +1,378 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_convolve_HWC_q7_fast_nonsquare.c
+ * Description:  Fast Q7 version of convolution (non-square shape)
+ *
+ * $Date:        July 20, 2021
+ * $Revision:    V.1.1.2
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/**
+ * @brief Fast Q7 convolution function (non-square shape)
+ * @param[in]     Im_in        pointer to input tensor
+ * @param[in]     dim_im_in_x  input tensor dimension x
+ * @param[in]     dim_im_in_y  input tensor dimension y
+ * @param[in]     ch_im_in     number of input tensor channels
+ * @param[in]     wt           pointer to kernel weights
+ * @param[in]     ch_im_out    number of filters, i.e., output tensor channels
+ * @param[in]     dim_kernel_x filter kernel size x
+ * @param[in]     dim_kernel_y filter kernel size y
+ * @param[in]     padding_x    padding size x
+ * @param[in]     padding_y    padding size y
+ * @param[in]     stride_x     convolution stride x
+ * @param[in]     stride_y     convolution stride y
+ * @param[in]     bias         pointer to bias
+ * @param[in]     bias_shift   amount of left-shift for bias
+ * @param[in]     out_shift    amount of right-shift for output
+ * @param[in,out] Im_out       pointer to output tensor
+ * @param[in]     dim_im_out_x output tensor dimension x
+ * @param[in]     dim_im_out_y output tensor dimension y
+ * @param[in,out] bufferA      pointer to buffer space for input
+ * @param[in,out] bufferB      pointer to buffer space for output
+ * @return        The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ * + * This function is the version with full list of optimization tricks, but with + * some constraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + +arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* ----------------------- + * Here we use bufferA as q15_t internally as computation are done with q15_t level + * im2col are done to output in q15_t format from q7_t input + */ + + q15_t *pBuffer = bufferA; + q7_t *pOut = Im_out; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + /* + * Here we split the entire matrix into three regions depending on the padding situation + * Top: i_out_y from 0 to padding - 1 + * Middle: i_out_y from padding to dim_im_out-padding-1 + * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 + */ + + /* top part */ + for (i_out_y = 0; i_out_y < padding_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* middle part, here we also divide the x into left, mid and right */ + for (; i_out_y < dim_im_out_y - padding_y; i_out_y++) + { + + /* left part */ + for (i_out_x = 0; i_out_x < padding_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = 
arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* mid part */ + for (; i_out_x < dim_im_out_x - padding_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in, + pBuffer, + ch_im_in * dim_kernel_x); + pBuffer += ch_im_in * dim_kernel_x; + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + + /* right part */ + for (; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + for (; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* This part implements the im2col function */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q15(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, sizeof(q15_t) * ch_im_in); + } + else + { + arm_q7_to_q15_reordered_no_shift( + (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) + { + pOut = arm_nn_mat_mult_kernel_q7_q15_reordered( + wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut); + /* counter reset */ + pBuffer = bufferA; + } + } + } + + /* check if there is left-over for compute */ + if (pBuffer != bufferA) + { + const q7_t *pA = wt; + int i; + for (i = 0; i < ch_im_out; i++) + { + q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + const q15_t *pB = bufferA; + /* basically each time it process 4 entries */ + uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2; + + while (colCnt) + { + + q31_t inA1, inA2; + q31_t inB1, inB2; + + pA = read_and_pad_reordered(pA, &inA1, &inA2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA1, inB1, sum); + inB2 = arm_nn_read_q15x2_ia(&pB); + sum = __SMLAD(inA2, inB2, sum); + + 
colCnt--; + } + colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + sum += inA1 * inB1; + colCnt--; + } + *pOut = (q7_t)__SSAT((sum >> out_shift), 8); + pOut++; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) + { + /* check if the input dimension meets the constraints */ + return ARM_MATH_SIZE_MISMATCH; + } + + for (i = 0; i < ch_im_out; i++) + { + for (j = 0; j < dim_im_out_y; j++) + { + for (k = 0; k < dim_im_out_x; k++) + { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) + { + for (n = 0; n < dim_kernel_x; n++) + { + /* if-for implementation */ + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + for (l = 0; l < ch_im_in; l++) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in + + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c new file mode 100644 index 0000000..f509f26 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c @@ -0,0 +1,241 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_fast_s16.c + * Description: Optimized s16 version of convolution. + * + * $Date: 12 August 2021 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Basic s16 convolution function. + * + * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels + * are multiples of 4 or atleast greater than 4. 
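+ *
+ * A hedged calling sketch (the struct and buffer names are illustrative assumptions,
+ * not names defined by this library); note that this kernel rejects filters where
+ * w * h * input channels >= 512:
+ *
+ * <pre>
+ *     cmsis_nn_context ctx;
+ *     ctx.size = arm_convolve_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+ *     ctx.buf = scratch_mem; // caller-owned scratch of at least ctx.size bytes
+ *     arm_status s = arm_convolve_fast_s16(&ctx, &conv_params, &quant_params,
+ *                                          &input_dims, input_data, &filter_dims, filter_data,
+ *                                          &bias_dims, bias_data, &output_dims, output_data);
+ * </pre>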
+ * + */ + +arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data) +{ + (void)bias_dims; + if (filter_dims->w * filter_dims->h * input_dims->c >= 512) + { + return ARM_MATH_SIZE_MISMATCH; + } + + if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q15_t *buffer_a = (q15_t *)ctx->buf; + + const int32_t input_batches = input_dims->n; + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t input_ch = input_dims->c; + const int32_t kernel_x = filter_dims->w; + const int32_t kernel_y = filter_dims->h; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_ch = output_dims->c; + + const int32_t pad_x = conv_params->padding.w; + const int32_t pad_y = conv_params->padding.h; + const int32_t stride_x = conv_params->stride.w; + const int32_t stride_y = conv_params->stride.h; + + const int16_t out_activation_min = conv_params->activation.min; + const int16_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Generate two columns from the input tensor a GEMM computation */ + q15_t *two_column_buf = buffer_a; + q15_t *out = output_data; + /* This part implements the im2col function */ + for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++) + { + for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++) + { + for (int32_t i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y; + i_ker_y++) + { + for (int32_t i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) + { + /* Filling 0 for out-of-bound paddings */ + arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch); + } + else + { + arm_memcpy_q7((q7_t *)two_column_buf, + (const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch), + input_ch * sizeof(q15_t)); + } + two_column_buf += input_ch; + } + } + /* Computation is filed for every 2 columns */ + if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x) + { + out = arm_nn_mat_mult_kernel_s16(filter_data, + buffer_a, + output_ch, + output_shift, + output_mult, + out_activation_min, + out_activation_max, + (input_ch * kernel_y * kernel_x), + bias_data, + out); + + /* Counter reset */ + two_column_buf = buffer_a; + } + } + } + + /* Left-over because odd number of output pixels */ + if (two_column_buf != buffer_a) + { + const q7_t *ker_a = filter_data; + int i; + + for (i = 0; i < output_ch; i++) + { + /* Init the accumulator*/ + q31_t sum = 0; + + /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ + const q15_t *ip_as_col = buffer_a; + + /* 4 multiply and accumulates are done in one loop. 
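+               read_and_pad() expands four q7 kernel values into two 32-bit words, each
+               packing two sign-extended q15 values; each __SMLAD then performs two 16x16
+               multiply-accumulates, so one pass of the loop retires four MACs.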
*/ + uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; + + while (col_count) + { + q31_t ker_a1, ker_a2; + q31_t ip_b1, ip_b2; + + ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); + + ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a1, ip_b1, sum); + ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a2, ip_b2, sum); + + col_count--; + } + /* Handle left over mac */ + col_count = input_ch * kernel_y * kernel_x & 0x3; + while (col_count) + { + q7_t ker_a1 = *ker_a++; + q15_t ip_b1 = *ip_as_col++; + sum += ker_a1 * ip_b1; + col_count--; + } + if (bias_data) + { + q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]); + q63_t acc_64 = sum + bias_data[i]; + sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]); + } + else + { + sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]); + } + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + *out++ = (q15_t)sum; + } + } +#else + (void)input_data; + (void)output_data; + (void)bias_data; + (void)filter_data; + (void)buffer_a; + (void)kernel_x; + (void)kernel_y; + (void)pad_x; + (void)pad_y; + (void)stride_x; + (void)stride_y; + (void)out_activation_min; + (void)out_activation_max; + (void)output_mult; + (void)output_shift; + return ARM_MATH_ARGUMENT_ERROR; +#endif + /* Advance to the next batch */ + input_data += (input_x * input_y * input_ch); + output_data += (output_x * output_y * output_ch); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +#else + (void)input_dims; + (void)filter_dims; + return 0; +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c new file mode 100644 index 0000000..9702575 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2010-2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_s16.c + * Description: s16 version of convolution using symmetric quantization. + * + * $Date: January 13, 2022 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Basic s16 convolution function. + * + * Refer header file for details. 
Optimal use case for the DSP/MVE implementation is when input and output channels + * are multiples of 4 or atleast greater than 4. + * + */ + +arm_status arm_convolve_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data) +{ + (void)bias_dims; + (void)ctx; + + const int32_t input_batches = input_dims->n; + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t input_ch = input_dims->c; + const int32_t kernel_x = filter_dims->w; + const int32_t kernel_y = filter_dims->h; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_ch = output_dims->c; + + const int32_t pad_x = conv_params->padding.w; + const int32_t pad_y = conv_params->padding.h; + const int32_t stride_x = conv_params->stride.w; + const int32_t stride_y = conv_params->stride.h; + const int32_t dilation_x = conv_params->dilation.w; + const int32_t dilation_y = conv_params->dilation.h; + + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + for (int32_t i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i_out_ch]); + + for (int32_t base_idx_y = -pad_y, i_out_y = 0; i_out_y < output_y; base_idx_y += stride_y, i_out_y++) + { + for (int32_t base_idx_x = -pad_x, i_out_x = 0; i_out_x < output_x; base_idx_x += stride_x, i_out_x++) + { + int64_t conv_out_acc = 0; + + const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y; + const int32_t ker_y_start = MAX(0, start_y_max); + const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x; + const int32_t ker_x_start = MAX(0, start_x_max); + const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y; + const int32_t ker_y_end = MIN(kernel_y, end_min_y); + const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x; + const int32_t ker_x_end = MIN(kernel_x, end_min_x); + + for (int32_t i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + for (int32_t i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t in_row = base_idx_y + dilation_y * i_ker_y; + const int32_t in_col = base_idx_x + dilation_x * i_ker_x; + + for (int32_t i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + conv_out_acc += input_data[(in_row * input_x + in_col) * input_ch + i_input_ch] * + filter_data[i_out_ch * input_ch * kernel_y * kernel_x + + (i_ker_y * kernel_x + i_ker_x) * input_ch + i_input_ch]; + } + } + } + + if (bias_data) + { + conv_out_acc += bias_data[i_out_ch]; + } + + int32_t conv_out = arm_nn_requantize_s64(conv_out_acc, reduced_multiplier, output_shift[i_out_ch]); + conv_out = MAX(conv_out, out_activation_min); + conv_out = MIN(conv_out, out_activation_max); + output_data[i_out_ch + (i_out_y * output_x + i_out_x) * output_ch] = (int16_t)conv_out; + } + } + } + /* Advance to the next batch */ + input_data += 
(input_x * input_y * input_ch); + output_data += (output_x * output_y * output_ch); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ + (void)input_dims; + (void)filter_dims; + return 0; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c new file mode 100644 index 0000000..e884b31 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_s8.c + * Description: s8 version of convolution using symmetric quantization. + * + * $Date: December 14, 2021 + * $Revision: V.2.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Basic s8 convolution function. + * + * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels + * are multiples of 4 or atleast greater than 4. 
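+ *
+ * A minimal caller sketch, hedged: scratch_mem and the dims/params structs are
+ * assumptions for illustration, not names defined by this library.
+ *
+ * <pre>
+ *     cmsis_nn_context ctx;
+ *     ctx.size = arm_convolve_s8_get_buffer_size(&input_dims, &filter_dims);
+ *     ctx.buf = scratch_mem; // must be non-NULL whenever the returned size is > 0
+ *     arm_status s = arm_convolve_s8(&ctx, &conv_params, &quant_params,
+ *                                    &input_dims, input_data, &filter_dims, filter_data,
+ *                                    &bias_dims, bias_data, &output_dims, output_data);
+ * </pre>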
+ * + */ + +arm_status arm_convolve_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + (void)bias_dims; + + if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q15_t *buffer_a = (q15_t *)ctx->buf; + + const int32_t input_batches = input_dims->n; + const uint16_t input_x = input_dims->w; + const uint16_t input_y = input_dims->h; + const uint16_t input_ch = input_dims->c; + const uint16_t kernel_x = filter_dims->w; + const uint16_t kernel_y = filter_dims->h; + const uint16_t output_x = output_dims->w; + const uint16_t output_y = output_dims->h; + const uint16_t output_ch = output_dims->c; + + const uint16_t pad_x = conv_params->padding.w; + const uint16_t pad_y = conv_params->padding.h; + const uint16_t stride_x = conv_params->stride.w; + const uint16_t stride_y = conv_params->stride.h; + + const int32_t input_offset = conv_params->input_offset; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + int i_batch; + for (i_batch = 0; i_batch < input_batches; i_batch++) + { +#if defined(ARM_MATH_MVEI) + /* Generate upto four columns from the input tensor a GEMM computation */ + q7_t *im2col_buf = (q7_t *)buffer_a; + q7_t *out = output_data; + int32_t buffer_fill_cnt = 0; + int32_t padded = 0; + const int32_t num_elem = kernel_x * kernel_y * input_ch; + const int32_t dilation_x = conv_params->dilation.w; + const int32_t dilation_y = conv_params->dilation.h; + + /* This part implements the im2col function */ + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int32_t base_idx_x = stride_x * i_out_x - pad_x; + const int32_t base_idx_y = stride_y * i_out_y - pad_y; + + for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++) + { + for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++) + { + const int32_t k_y = base_idx_y + dilation_y * i_ker_y; + const int32_t k_x = base_idx_x + dilation_x * i_ker_x; + + if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x) + { + memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch); + padded = 1; + } + else + { + arm_memcpy_q7(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch); + } + im2col_buf += input_ch; + } + } + + buffer_fill_cnt++; + + /* Computation is filed for every 4 columns */ + if (buffer_fill_cnt == 4 && (padded == 0)) + { + buffer_fill_cnt = 0; + out = arm_nn_mat_mul_core_4x_s8(num_elem, + num_elem, + (q7_t *)buffer_a, + filter_data, + output_ch, + conv_params, + quant_params, + bias_data, + out); + im2col_buf = (q7_t *)buffer_a; + } + else if (buffer_fill_cnt == 4 && (padded != 0)) + { + buffer_fill_cnt = 0; + out = arm_nn_mat_mult_s8(filter_data, + (q7_t *)buffer_a, + output_ch, + 4, + output_shift, + output_mult, + out_offset, + input_offset, + 0, + out_activation_min, + out_activation_max, + num_elem, + bias_data, + out); + + im2col_buf = (q7_t *)buffer_a; + padded = 0; + } + } + } + /* Handle left over 
columns */ + if (buffer_fill_cnt != 0) + { + out = arm_nn_mat_mult_s8(filter_data, + (q7_t *)buffer_a, + output_ch, + buffer_fill_cnt, + output_shift, + output_mult, + out_offset, + input_offset, + 0, + out_activation_min, + out_activation_max, + num_elem, + bias_data, + out); + } +#else // #if defined(ARM_MATH_MVEI) + const uint16_t dilation_x = conv_params->dilation.w; + const uint16_t dilation_y = conv_params->dilation.h; + + int32_t i_out_y, i_out_x, i_ker_y, i_ker_x; + + /* Generate two columns from the input tensor a GEMM computation */ + q15_t *two_column_buf = buffer_a; + q7_t *out = output_data; + + /* This part implements the im2col function */ + for (i_out_y = 0; i_out_y < output_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int32_t base_idx_y = stride_y * i_out_y - pad_y; + const int32_t base_idx_x = stride_x * i_out_x - pad_x; + + for (i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++) + { + for (i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++) + { + const int32_t k_y = base_idx_y + dilation_y * i_ker_y; + const int32_t k_x = base_idx_x + dilation_x * i_ker_x; + + if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x) + { + /* Filling 0 for out-of-bound paddings */ + memset(two_column_buf, 0, sizeof(q15_t) * input_ch); + } + else + { + /* Copying the pixel data to column */ + arm_q7_to_q15_with_offset( + input_data + (k_y * input_x + k_x) * input_ch, two_column_buf, input_ch, input_offset); + } + two_column_buf += input_ch; + } + } + + /* Computation is filed for every 2 columns */ + if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x) + { + out = arm_nn_mat_mult_kernel_s8_s16(filter_data, + buffer_a, + output_ch, + output_shift, + output_mult, + out_offset, + out_activation_min, + out_activation_max, + input_ch * kernel_y * kernel_x, + bias_data, + out); + + /* counter reset */ + two_column_buf = buffer_a; + } + } + } + + /* left-over because odd number of output pixels */ + if (two_column_buf != buffer_a) + { + const q7_t *ker_a = filter_data; + int i; + + for (i = 0; i < output_ch; i++) + { + /* Load the accumulator with bias first */ + q31_t sum = 0; + if (bias_data) + { + sum = bias_data[i]; + } + + /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */ + const q15_t *ip_as_col = buffer_a; + + /* 4 multiply and accumulates are done in one loop. 
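+               When ARM_MATH_DSP is available, read_and_pad() plus the two __SMLAD calls
+               below retire four MACs per pass; without it, the plain C loop handles every
+               multiply-accumulate individually.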
*/ +#if defined(ARM_MATH_DSP) + uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2; + + while (col_count) + { + q31_t ker_a1, ker_a2; + q31_t ip_b1, ip_b2; + + ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2); + + ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a1, ip_b1, sum); + ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col); + sum = __SMLAD(ker_a2, ip_b2, sum); + + col_count--; + } + /* Handle left over mac */ + col_count = input_ch * kernel_y * kernel_x & 0x3; +#else + uint16_t col_count = input_ch * kernel_y * kernel_x; +#endif + while (col_count) + { + q7_t ker_a1 = *ker_a++; + q15_t ip_b1 = *ip_as_col++; + sum += ker_a1 * ip_b1; + col_count--; + } + + sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]); + sum += out_offset; + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + *out++ = (q7_t)sum; + } + } +#endif // #if defined(ARM_MATH_MVEI) + /* Advance to the next batch */ + input_data += (input_x * input_y * input_ch); + output_data += (output_x * output_y * output_ch); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims) +{ +#if defined(ARM_MATH_MVEI) + int32_t col_length = input_dims->c * filter_dims->w * filter_dims->h; + // Get number of complete int16 lanes(multiple of 8) for given col_length. This is dependent on + // implementation of arm_nn_mat_mult_s8 + col_length = (col_length + 7) / 8; + // 4 -> number of im2col buffers, 8 -> 8 elements per Q register + return 4 * col_length * 8 * (int32_t)sizeof(int8_t); +#else + return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t); +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c new file mode 100644 index 0000000..75bb26f --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_wrapper_s16.c + * Description: s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in + * cmsis-nn to perform the convolution. + * + * $Date: 13 January 2022 + * $Revision: V.1.2.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Convolution layer + * + * Refer header file for details. 
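+ *
+ * On cores with the DSP extension (and without MVE), the wrapper below selects
+ * arm_convolve_fast_s16() when filter_dims->w * filter_dims->h * input_dims->c is
+ * under 512 and both dilation factors are 1, and falls back to arm_convolve_s16()
+ * otherwise. A hedged caller sketch (variable names are illustrative only):
+ *
+ * <pre>
+ *     cmsis_nn_context ctx;
+ *     ctx.size = arm_convolve_wrapper_s16_get_buffer_size(&conv_params, &input_dims,
+ *                                                         &filter_dims, &output_dims);
+ *     ctx.buf = scratch_mem; // sized for whichever kernel the wrapper will pick
+ * </pre>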
+ * + */ + +arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int64_t *bias_data, + const cmsis_nn_dims *output_dims, + q15_t *output_data) +{ +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + if (filter_dims->w * filter_dims->h * input_dims->c < 512 && + (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_fast_s16(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else + { + return arm_convolve_s16(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } +#else + return arm_convolve_s16(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); +#endif +} + +int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)conv_params; + (void)output_dims; + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + if (filter_dims->w * filter_dims->h * input_dims->c < 512 && + (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims); + } + + return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); +#else + return arm_convolve_s16_get_buffer_size(input_dims, filter_dims); +#endif +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c new file mode 100644 index 0000000..bf1cd70 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_convolve_wrapper_s8.c + * Description: s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in + * cmsis-nn to perform the convolution. + * + * $Date: 02. December 2021 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Convolution layer + * + * Refer header file for details. 
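+ *
+ * For orientation: the wrapper below routes 1x1 kernels with unit stride, zero
+ * padding, unit dilation and input channels divisible by 4 to
+ * arm_convolve_1x1_s8_fast(); 1xN single-batch shapes (input and filter height 1,
+ * output width divisible by 4) to arm_convolve_1_x_n_s8(); and everything else to
+ * arm_convolve_s8(). arm_convolve_wrapper_s8_get_buffer_size() applies the same
+ * tests, so sizing the context buffer with it keeps the caller consistent with the
+ * kernel that will actually run.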
+ * + */ + +arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input_data, + const cmsis_nn_dims *filter_dims, + const q7_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + q7_t *output_data) +{ + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) && + (conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_1x1_s8_fast(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) && + (input_dims->n == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_1_x_n_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } + else + { + return arm_convolve_s8(ctx, + conv_params, + quant_params, + input_dims, + input_data, + filter_dims, + filter_data, + bias_dims, + bias_data, + output_dims, + output_data); + } +} + +int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) && + (conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) && + (filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims); + } + else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) && + (input_dims->n == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1)) + { + return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims); + } + else + { + return arm_convolve_s8_get_buffer_size(input_dims, filter_dims); + } +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c new file mode 100644 index 0000000..d5569b3 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_3x3_s8.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_3x3_s8.c + * Description: Optimized s8 depthwise convolution function for channel + * multiplier of 1 and 3x3 kernel size. + * + * $Date: 09. October 2020 + * $Revision: V.2.0.1 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Optimized s8 depthwise convolution function with constraint that + * in_channel == out_channel and kernel_x == kernel_y == 3 with pads at most 1 + * + * Refer prototype header file for details. + * + */ + +arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + (void)ctx; + (void)bias_dims; + + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t input_ch = input_dims->c; + const int32_t output_ch = output_dims->c; + const int32_t pad_x = dw_conv_params->padding.w; + const int32_t pad_y = dw_conv_params->padding.h; + const int32_t stride_x = dw_conv_params->stride.w; + const int32_t stride_y = dw_conv_params->stride.h; + const int32_t *output_shift = quant_params->shift; + const int32_t *output_mult = quant_params->multiplier; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_offset = dw_conv_params->output_offset; + const int32_t input_offset = dw_conv_params->input_offset; + const int32_t output_activation_min = dw_conv_params->activation.min; + const int32_t output_activation_max = dw_conv_params->activation.max; + + /* Check input constraints input_ch == output_ch */ + if (input_ch != output_ch) + { + return ARM_MATH_SIZE_MISMATCH; + } + /* Check input constraints pad_x <= 1 */ + if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + int32_t in_ch = 0; + int32_t ker_w_start = MAX(0, -in_w); + + for (; in_ch <= (input_ch - 4); in_ch += 4) + { + int32_t out_buff0 = bias[in_ch + 0]; + int32_t out_buff1 = bias[in_ch + 1]; + int32_t out_buff2 = bias[in_ch + 2]; + int32_t out_buff3 = bias[in_ch + 3]; + + const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch; + const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) + { + int32_t in_val = 0; + int32_t ker_val = 0; + + if (ker_w_start == 0) + { + in_val = arm_nn_read_q7x4(input_ptr); + ker_val = arm_nn_read_q7x4(kernel_ptr); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * 
(int8_t)(ker_val >> 24); + } + + in_val = arm_nn_read_q7x4(input_ptr + input_ch); + ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); + + if ((input_x - in_w) >= 3) + { + in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1)); + ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1)); + + out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val; + out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8); + out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16); + out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24); + } + + input_ptr += (input_ch * input_x); + kernel_ptr += (input_ch * 3); + } + + out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]); + out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]); + out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]); + out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]); + + out_buff0 += output_offset; + out_buff1 += output_offset; + out_buff2 += output_offset; + out_buff3 += output_offset; + + out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max); + out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max); + out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max); + out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max); + + output[out_idx++] = (int8_t)out_buff0; + output[out_idx++] = (int8_t)out_buff1; + output[out_idx++] = (int8_t)out_buff2; + output[out_idx++] = (int8_t)out_buff3; + } + + // Leftover + for (; in_ch < input_ch; ++in_ch) + { + int32_t out_buff = bias[in_ch]; + + const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch; + const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h) + { + if (ker_w_start == 0) + { + out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr); + } + + out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch); + + if ((input_x - in_w) >= 3) + { + out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1)); + } + + input_ptr += (input_ch * input_x); + kernel_ptr += (input_ch * 3); + } + + out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]); + out_buff += output_offset; + out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max); + output[out_idx++] = (int8_t)out_buff; + } + } + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c new file mode 100644 index 0000000..42e4bbd --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s16.c @@ -0,0 +1,292 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_s16.c + * Description: s16 version of depthwise convolution. + * + * $Date: 26. Jan 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +static void __attribute__((unused)) depthwise_conv_s16_mult_4_s16(const int16_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const int8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int64_t *bias, + int16_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; + ++in_ch, out_ch += ch_mult) + { + for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4) + { + int32_t out_buff32[4] = {REDUCE_MULTIPLIER(output_mult[out_ch + 0 + mult_tile]), + REDUCE_MULTIPLIER(output_mult[out_ch + 1 + mult_tile]), + REDUCE_MULTIPLIER(output_mult[out_ch + 2 + mult_tile]), + REDUCE_MULTIPLIER(output_mult[out_ch + 3 + mult_tile])}; + + int64_t out_buff[4] = {0, 0, 0, 0}; + + if (bias) + { + out_buff[0] = bias[out_ch + 0 + mult_tile]; + out_buff[1] = bias[out_ch + 1 + mult_tile]; + out_buff[2] = bias[out_ch + 2 + mult_tile]; + out_buff[3] = bias[out_ch + 3 + mult_tile]; + } + + for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h) + { + int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch; + int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch; +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) +#pragma clang loop unroll(disable) +#endif + for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); + ++ker_w, ker_idx += output_ch) + { + // TODO: Unroll of 4 with 64 bit accumulator will probably result in too much register + // spills. Try with unroll of 2 when enabling this. 
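+                            /*
+                             * [Editorial note] A 64-bit accumulator is used on this path because an
+                             * s16 activation times an s8 weight yields up to roughly 23 significant
+                             * bits per product, so a 32-bit sum could overflow for larger kernels.
+                             * A minimal scalar sketch of the per-output MAC performed below (in16,
+                             * w8 and kernel_len are illustrative names, not library symbols):
+                             *
+                             *   int64_t acc = bias ? bias[ch] : 0;
+                             *   for (int32_t k = 0; k < kernel_len; k++)
+                             *   {
+                             *       acc += (int64_t)in16[k] * w8[k];
+                             *   }
+                             */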
+ int32_t in_val = input[in_idx + ker_w * input_ch]; + out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile]; + out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile]; + out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile]; + out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile]; + } + } + + out_buff32[0] = + arm_nn_requantize_s64(out_buff[0], out_buff32[0], output_shift[out_ch + 0 + mult_tile]); + out_buff32[1] = + arm_nn_requantize_s64(out_buff[1], out_buff32[1], output_shift[out_ch + 1 + mult_tile]); + out_buff32[2] = + arm_nn_requantize_s64(out_buff[2], out_buff32[2], output_shift[out_ch + 2 + mult_tile]); + out_buff32[3] = + arm_nn_requantize_s64(out_buff[3], out_buff32[3], output_shift[out_ch + 3 + mult_tile]); + + out_buff32[0] = MIN(MAX(out_buff32[0], output_activation_min), output_activation_max); + out_buff32[1] = MIN(MAX(out_buff32[1], output_activation_min), output_activation_max); + out_buff32[2] = MIN(MAX(out_buff32[2], output_activation_min), output_activation_max); + out_buff32[3] = MIN(MAX(out_buff32[3], output_activation_min), output_activation_max); + + output[out_idx++] = (int16_t)out_buff32[0]; + output[out_idx++] = (int16_t)out_buff32[1]; + output[out_idx++] = (int16_t)out_buff32[2]; + output[out_idx++] = (int16_t)out_buff32[3]; + } + } + } + } +} + +static void depthwise_conv_s16_generic_s16(const int16_t *input, + const uint16_t input_batches, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const int8_t *kernel, + const uint16_t ch_mult, + const uint16_t kernel_x, + const uint16_t kernel_y, + const uint16_t pad_x, + const uint16_t pad_y, + const uint16_t stride_x, + const uint16_t stride_y, + const int64_t *bias, + int16_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_activation_min, + const int32_t output_activation_max, + const uint16_t dilation_x, + const uint16_t dilation_y) + +{ + for (int i_batch = 0; i_batch < input_batches; i_batch++) + { + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; + + const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]); + int64_t acc_0 = 0; + + int ker_y_start; + int ker_x_start; + int ker_y_end; + int ker_x_end; + + if (dilation_x > 1) + { + const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x; + ker_x_start = MAX(0, start_x_max); + const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x; + ker_x_end = MIN(kernel_x, end_min_x); + } + else + { + ker_x_start = MAX(0, -base_idx_x); + ker_x_end = MIN(kernel_x, input_x - base_idx_x); + } + + if (dilation_y > 1) + { + const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y; + ker_y_start = MAX(0, start_y_max); + const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y; + ker_y_end = MIN(kernel_y, end_min_y); + } + else + { + ker_y_start = MAX(0, -base_idx_y); + ker_y_end = MIN(kernel_y, input_y - base_idx_y); + } + + if (bias) + { + acc_0 = bias[idx_out_ch]; + } + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + 
dilation_y * i_ker_y; + for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + dilation_x * i_ker_x; + int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch; + int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch; + + acc_0 += input[idx_0] * kernel[ker_idx_0]; + } + } + + /* Requantize and clamp output to provided range */ + int32_t result = arm_nn_requantize_s64(acc_0, reduced_multiplier, output_shift[idx_out_ch]); + result = MAX(result, output_activation_min); + result = MIN(result, output_activation_max); + *output++ = (int16_t)result; + } + } + } + } + /* Advance to the next batch */ + input += (input_x * input_y * input_ch); + } +} + +/* + * Basic s16 depthwise convolution function. + * + * Refer header file for details. + * + */ +arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q15_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int64_t *bias, + const cmsis_nn_dims *output_dims, + q15_t *output) +{ + const uint16_t dilation_x = dw_conv_params->dilation.w; + const uint16_t dilation_y = dw_conv_params->dilation.h; + + (void)bias_dims; + (void)ctx; + + depthwise_conv_s16_generic_s16(input, + input_dims->n, + input_dims->w, + input_dims->h, + input_dims->c, + kernel, + dw_conv_params->ch_mult, + filter_dims->w, + filter_dims->h, + dw_conv_params->padding.w, + dw_conv_params->padding.h, + dw_conv_params->stride.w, + dw_conv_params->stride.h, + bias, + output, + quant_params->shift, + quant_params->multiplier, + output_dims->w, + output_dims->h, + dw_conv_params->activation.min, + dw_conv_params->activation.max, + dilation_x, + dilation_y); + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c new file mode 100644 index 0000000..297b7af --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c @@ -0,0 +1,347 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_s8.c + * Description: s8 version of depthwise convolution. + * + * $Date: 30. 
Dec 2021 + * $Revision: V.2.7.1 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +static void depthwise_conv_s8_mult_4(const int8_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const int8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + int8_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; + ++in_ch, out_ch += ch_mult) + { + for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4) + { + int32_t out_buff[4] = {0, 0, 0, 0}; + if (bias) + { + out_buff[0] = bias[out_ch + 0 + mult_tile]; + out_buff[1] = bias[out_ch + 1 + mult_tile]; + out_buff[2] = bias[out_ch + 2 + mult_tile]; + out_buff[3] = bias[out_ch + 3 + mult_tile]; + } + + for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h) + { + int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch; + int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch; +#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) +#pragma clang loop unroll(disable) +#endif + for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); + ++ker_w, ker_idx += output_ch) + { + int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset; + out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile]; + out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile]; + out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile]; + out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile]; + } + } +#if defined(ARM_MATH_MVEI) + (void)out_idx; + int32x4_t res = vldrwq_s32(out_buff); + res = arm_requantize_mve_32x4(res, + vldrwq_s32(&output_mult[out_ch + mult_tile]), + vldrwq_s32(&output_shift[out_ch + mult_tile])); + res = vaddq_n_s32(res, output_offset); + + res = vmaxq_s32(res, vdupq_n_s32(output_activation_min)); + res = vminq_s32(res, vdupq_n_s32(output_activation_max)); + vstrbq_s32(output, res); + output += 4; +#else + out_buff[0] = arm_nn_requantize( + out_buff[0], output_mult[out_ch + 0 + mult_tile], output_shift[out_ch + 0 + mult_tile]); + out_buff[1] = arm_nn_requantize( + out_buff[1], output_mult[out_ch + 1 + mult_tile], output_shift[out_ch + 1 + mult_tile]); + out_buff[2] = arm_nn_requantize( + out_buff[2], output_mult[out_ch + 2 + mult_tile], output_shift[out_ch + 2 + mult_tile]); + out_buff[3] = arm_nn_requantize( + out_buff[3], output_mult[out_ch + 3 + mult_tile], output_shift[out_ch + 3 + mult_tile]); + + out_buff[0] += output_offset; + out_buff[1] += output_offset; + out_buff[2] += output_offset; + out_buff[3] += output_offset; + + out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max); + 
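+                    /*
+                     * [Editorial note] The clamp sequence around this note applies the standard
+                     * CMSIS-NN output stage: per-channel requantize, add the output zero point,
+                     * then clamp to the activation range. For a single accumulator acc and
+                     * channel ch this is, schematically:
+                     *
+                     *   int32_t q = arm_nn_requantize(acc, output_mult[ch], output_shift[ch]);
+                     *   q += output_offset;
+                     *   q = MIN(MAX(q, output_activation_min), output_activation_max);
+                     */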
out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max); + out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max); + out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max); + + output[out_idx++] = (int8_t)out_buff[0]; + output[out_idx++] = (int8_t)out_buff[1]; + output[out_idx++] = (int8_t)out_buff[2]; + output[out_idx++] = (int8_t)out_buff[3]; + +#endif + } + } + } + } +} + +static void depthwise_conv_s8_generic(const q7_t *input, + const uint16_t input_batches, + const uint16_t input_x, + const uint16_t input_y, + const uint16_t input_ch, + const q7_t *kernel, + const uint16_t output_ch, + const uint16_t ch_mult, + const uint16_t kernel_x, + const uint16_t kernel_y, + const uint16_t pad_x, + const uint16_t pad_y, + const uint16_t stride_x, + const uint16_t stride_y, + const int32_t *bias, + q7_t *output, + const int32_t *output_shift, + const int32_t *output_mult, + const uint16_t output_x, + const uint16_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t output_activation_min, + const int32_t output_activation_max, + const uint16_t dilation_x, + const uint16_t dilation_y) + +{ + (void)output_ch; + int i_out = 0; + int i_batch; + + for (i_batch = 0; i_batch < input_batches; i_batch++) + { + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; + int32_t acc_0 = 0; + + int ker_y_start; + int ker_x_start; + int ker_y_end; + int ker_x_end; + + if (dilation_x > 1) + { + const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x; + ker_x_start = MAX(0, start_x_max); + const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x; + ker_x_end = MIN(kernel_x, end_min_x); + } + else + { + ker_x_start = MAX(0, -base_idx_x); + ker_x_end = MIN(kernel_x, input_x - base_idx_x); + } + + if (dilation_y > 1) + { + const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y; + ker_y_start = MAX(0, start_y_max); + const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y; + ker_y_end = MIN(kernel_y, end_min_y); + } + else + { + ker_y_start = MAX(0, -base_idx_y); + ker_y_end = MIN(kernel_y, input_y - base_idx_y); + } + + if (bias) + { + acc_0 = bias[idx_out_ch]; + } + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + dilation_y * i_ker_y; + for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + dilation_x * i_ker_x; + int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch; + int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch; + + acc_0 += (input[idx_0] + input_offset) * kernel[ker_idx_0]; + } + } + + /* Requantize and clamp output to provided range */ + acc_0 = arm_nn_requantize(acc_0, output_mult[idx_out_ch], output_shift[idx_out_ch]); + acc_0 += output_offset; + acc_0 = MAX(acc_0, output_activation_min); + acc_0 = MIN(acc_0, output_activation_max); + + output[i_out++] = acc_0; + } + } + } + } + /* Advance to the next batch */ + input += (input_x * input_y * input_ch); + } +} + +/* + * Basic s8 depthwise 
convolution function. + * + * Refer header file for details. + * Optimization using DSP extension is not available for the generic case where channel multiplier is > 1. + * + */ +arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + const uint16_t dilation_x = dw_conv_params->dilation.w; + const uint16_t dilation_y = dw_conv_params->dilation.h; + + (void)dw_conv_params->dilation; + (void)bias_dims; + (void)ctx; + + if (dw_conv_params->ch_mult % 4 == 0 && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + depthwise_conv_s8_mult_4(input, + input_dims->w, + input_dims->h, + input_dims->c, + kernel, + output_dims->c, + dw_conv_params->ch_mult, + filter_dims->w, + filter_dims->h, + dw_conv_params->padding.w, + dw_conv_params->padding.h, + dw_conv_params->stride.w, + dw_conv_params->stride.h, + bias, + output, + quant_params->shift, + quant_params->multiplier, + output_dims->w, + output_dims->h, + dw_conv_params->output_offset, + dw_conv_params->input_offset, + dw_conv_params->activation.min, + dw_conv_params->activation.max); + } + else + { + depthwise_conv_s8_generic(input, + input_dims->n, + input_dims->w, + input_dims->h, + input_dims->c, + kernel, + output_dims->c, + dw_conv_params->ch_mult, + filter_dims->w, + filter_dims->h, + dw_conv_params->padding.w, + dw_conv_params->padding.h, + dw_conv_params->stride.w, + dw_conv_params->stride.h, + bias, + output, + quant_params->shift, + quant_params->multiplier, + output_dims->w, + output_dims->h, + dw_conv_params->output_offset, + dw_conv_params->input_offset, + dw_conv_params->activation.min, + dw_conv_params->activation.max, + dilation_x, + dilation_y); + } + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c new file mode 100644 index 0000000..1edac04 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c @@ -0,0 +1,433 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_s8_opt.c + * Description: Optimized s8 depthwise separable convolution function for + * channel multiplier of 1. 
+ * + * $Date: January 26, 2021 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel + * + * Refer prototype header file for details. + * + */ + +arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *kernel, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + + const int32_t input_ch = input_dims->c; + const int32_t output_ch = output_dims->c; + + /* Check input constraints input_ch == output_ch */ + if (input_ch != output_ch) + { + return ARM_MATH_SIZE_MISMATCH; + } + + if (ctx->buf == NULL && arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims) > 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } +#ifdef ARM_MATH_DSP + const int32_t input_x = input_dims->w; + const int32_t input_y = input_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t kernel_y = filter_dims->h; + const int32_t pad_x = dw_conv_params->padding.w; + const int32_t pad_y = dw_conv_params->padding.h; + const int32_t stride_x = dw_conv_params->stride.w; + const int32_t stride_y = dw_conv_params->stride.h; + const int32_t *output_shift = quant_params->shift; + const int32_t *output_mult = quant_params->multiplier; + const int32_t output_x = output_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_offset = dw_conv_params->output_offset; + const int32_t input_offset = dw_conv_params->input_offset; + const int32_t output_activation_min = dw_conv_params->activation.min; + const int32_t output_activation_max = dw_conv_params->activation.max; + q15_t *buffer_a = (q15_t *)ctx->buf; + +#ifdef ARM_MATH_MVEI + (void)bias_dims; + /* Generate two columns from the input tensor */ + q7_t *lhs_buffer = (q7_t *)buffer_a; + q7_t *out = output; + int padded = 0; + int buffer_count = 0; + const int32_t kernel_size = kernel_x * kernel_y; + + /* This part implements the im2col function */ + for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++) + { + for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++) + { + for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++) + { + for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x) + { + arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)input_ch); + padded = 1; + } + else + { + arm_memcpy_q7(lhs_buffer, input + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)input_ch); + } + lhs_buffer += input_ch; + } + } + buffer_count++; + + if (buffer_count == 4) + { + lhs_buffer = (q7_t *)buffer_a; + if (padded == 0) + { + out = arm_nn_depthwise_conv_nt_t_s8(lhs_buffer, + kernel, + input_offset, + input_ch, + output_shift, + output_mult, + output_offset, + output_activation_min, + output_activation_max, + kernel_size, + bias, + out); + } + else + { + out = arm_nn_depthwise_conv_nt_t_padded_s8(lhs_buffer, + kernel, + input_offset, + 
input_ch, + output_shift, + output_mult, + output_offset, + output_activation_min, + output_activation_max, + kernel_size, + bias, + out); + padded = 0; + } + buffer_count = 0; + } + } + } + + /* Handle left over buffers */ + lhs_buffer = (q7_t *)buffer_a; + + for (int i_buf = 0; i_buf < buffer_count; i_buf++) + { + int32_t loop_count = (input_ch + 3) / 4; + + int32_t num_ch_to_process = input_ch; + for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++) + { + const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset; + const int8_t *row_0 = kernel + offset; + int32x4_t out_0 = vldrwq_s32(&bias[offset]); + + for (int i_ker = 0; i_ker < kernel_size; i_ker++) + { + const int32x4_t ker_0 = vldrbq_s32(row_0); + + int32x4_t ip_0 = vldrbq_s32(col_0); + ip_0 = vaddq_n_s32(ip_0, input_offset); + out_0 += vmulq_s32(ip_0, ker_0); + + col_0 += input_ch; + row_0 += input_ch; + } + + const int32x4_t mult = vldrwq_s32(&output_mult[offset]); + const int32x4_t shift = vldrwq_s32(&output_shift[offset]); + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_0 = vaddq_n_s32(out_0, output_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max)); + mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process); + vstrbq_p_s32(out, out_0, p); + + out += 4; + } + + const int tail_ch = input_ch & 0x3; + if (tail_ch != 0) + { + out -= (4 - tail_ch); + } + } + +#else // ARM_MATH_DSP + (void)bias_dims; + /* Run the following code in cores using DSP extension */ + q15_t *const col_buffer_start = buffer_a; + q15_t *col_buffer = col_buffer_start; + const int32_t *const bias_start_pos = bias; + const q31_t *const out_mult_start_pos = output_mult; + const q31_t *const out_shift_start_pos = output_shift; + uint16_t row_count; + uint16_t row_shift; + + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + + /* Out of bounds is only considered for the y axis as it provides a contiguous zero'ing opportunity than + along the x axis */ + const int ker_y_start = MAX(0, -base_idx_y); + /* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */ + const int ker_y_end = MIN(kernel_y, input_y - base_idx_y); + + int32_t index = 0; + if (ker_y_start != 0) + { + memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t)); + index += (kernel_x * input_ch) * ker_y_start; + } + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y; + + for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x; + if (idx_x < 0 || idx_x >= input_x) + { + memset(&col_buffer[index], 0, input_ch * sizeof(q15_t)); + } + else + { + arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch, + &col_buffer[index], + input_ch, + input_offset); + } + index += input_ch; + } + } + + const int diff = kernel_y - ker_y_end; + if (diff != 0) + { + memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t)); + } + + row_count = output_ch / 4; + row_shift = 0; + bias = bias_start_pos; + output_mult = out_mult_start_pos; + output_shift = out_shift_start_pos; + + while (row_count) + { + q31_t sum = *bias++; + q31_t sum_2 = *bias++; + q31_t sum_3 = *bias++; + q31_t 
sum_4 = *bias++; + + uint16_t col_count = (kernel_x * kernel_y) / 2; + q15_t *col_pos = col_buffer_start + row_shift; + const q7_t *row_pos = kernel + row_shift; + row_shift += 4; + + while (col_count) + { + /* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to + use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */ + /* Note: variable names can be improved here to align with rows and columns. */ + q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c; + /* Read 4 weights */ + ip_b1 = arm_nn_read_q7x4(row_pos); + ip_a1 = arm_nn_read_q7x4(row_pos + input_ch); + op_a = arm_nn_read_q15x2(col_pos); + op_b = arm_nn_read_q15x2(col_pos + input_ch); + + ip_a2 = __SXTB16(ip_b1); + ip_b1 = __SXTB16(__ROR(ip_b1, 8)); + + ip_b2 = __SXTB16(ip_a1); + ip_a1 = __SXTB16(__ROR(ip_a1, 8)); + + op_c = __PKHBT(op_b, op_a, 16); + op_a = __PKHTB(op_b, op_a, 16); + op_b = __PKHBT(ip_b2, ip_a2, 16); + sum = __SMLAD(op_c, op_b, sum); + + op_b = __PKHBT(ip_b1, ip_a1, 16); + sum_2 = __SMLAD(op_a, op_b, sum_2); + + op_a = arm_nn_read_q15x2(col_pos + 2); + op_b = arm_nn_read_q15x2(col_pos + input_ch + 2); + + op_c = __PKHBT(op_b, op_a, 16); + op_a = __PKHTB(op_b, op_a, 16); + op_b = __PKHTB(ip_a2, ip_b2, 16); + sum_3 = __SMLAD(op_c, op_b, sum_3); + + op_b = __PKHTB(ip_a1, ip_b1, 16); + sum_4 = __SMLAD(op_a, op_b, sum_4); + + row_pos += input_ch << 1; + col_pos += input_ch << 1; + col_count--; + } + + col_count = (kernel_x * kernel_y) & 0x1; + while (col_count) + { + sum += row_pos[0] * col_pos[0]; + sum_2 += row_pos[1] * col_pos[1]; + sum_3 += row_pos[2] * col_pos[2]; + sum_4 += row_pos[3] * col_pos[3]; + + row_pos += input_ch; + col_pos += input_ch; + + col_count--; + } + sum = arm_nn_requantize(sum, *output_mult++, *output_shift++); + sum += output_offset; + sum = MAX(sum, output_activation_min); + sum = MIN(sum, output_activation_max); + *output++ = (q7_t)sum; + + sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++); + sum_2 += output_offset; + sum_2 = MAX(sum_2, output_activation_min); + sum_2 = MIN(sum_2, output_activation_max); + *output++ = (q7_t)sum_2; + sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++); + sum_3 += output_offset; + sum_3 = MAX(sum_3, output_activation_min); + sum_3 = MIN(sum_3, output_activation_max); + *output++ = (q7_t)sum_3; + + sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++); + sum_4 += output_offset; + sum_4 = MAX(sum_4, output_activation_min); + sum_4 = MIN(sum_4, output_activation_max); + *output++ = (q7_t)sum_4; + + row_count--; + } + + row_count = output_ch & 0x3; + while (row_count) + { + q15_t *col_pos = col_buffer_start + row_shift; + const q7_t *row_pos = kernel + row_shift; + q31_t sum = *bias++; + const uint16_t col_count = (kernel_x * kernel_y); + row_shift += 1; + + for (int i = 0; i < col_count; i++) + { + sum += row_pos[i * input_ch] * col_pos[i * input_ch]; + } + sum = arm_nn_requantize(sum, *output_mult++, *output_shift++); + sum += output_offset; + sum = MAX(sum, output_activation_min); + sum = MIN(sum, output_activation_max); + *output++ = (q7_t)sum; + + row_count--; + } + + // clear counter and pointers + col_buffer = col_buffer_start; + } + } +#endif +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + return arm_depthwise_conv_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + kernel, + bias_dims, + bias, + output_dims, + output); +#endif /* ARM_MATH_MVEI | ARM_MATH_DSP */ 
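+
+    /*
+     * [Editorial note] Usage sketch: the caller owns the scratch buffer and passes
+     * it in through ctx->buf. Variable names below are illustrative only:
+     *
+     *   cmsis_nn_context ctx;
+     *   ctx.size = arm_depthwise_conv_s8_opt_get_buffer_size(&input_dims, &filter_dims);
+     *   ctx.buf = scratch_buffer; // at least ctx.size bytes, e.g. a static arena
+     *   arm_status status = arm_depthwise_conv_s8_opt(&ctx, &dw_conv_params, &quant_params,
+     *                                                 &input_dims, input, &filter_dims, kernel,
+     *                                                 &bias_dims, bias, &output_dims, output);
+     */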
+
+    /* Return to application */
+    return ARM_MATH_SUCCESS;
+}
+
+int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
+{
+#if defined(ARM_MATH_MVEI)
+    /* The + 4 accounts for out of bounds read of the lhs buffers in the *_nt_t_* functions. */
+    return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t) + 4;
+#elif defined(ARM_MATH_DSP)
+    return (input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
+#else
+    (void)input_dims;
+    (void)filter_dims;
+    return 0;
+#endif
+}
+
+/**
+ * @} end of NNConv group
+ */
diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c
new file mode 100644
index 0000000..c9d0afc
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_depthwise_conv_u8_basic_ver1.c
+ * Description:  u8 depthwise convolution function
+ *
+ * $Date:        09.
October 2020 + * $Revision: V.1.1.1 + * + * Target : Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +static void depthwise_conv_u8_mult_4(const uint8_t *input, + const int32_t input_x, + const int32_t input_y, + const int32_t input_ch, + const uint8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + uint8_t *output, + const int32_t output_shift, + const int32_t output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h) + { + for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w) + { + for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch; + ++in_ch, out_ch += ch_mult) + { + for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4) + { + int32_t out_buff[4]; + + out_buff[0] = 0; + out_buff[1] = 0; + out_buff[2] = 0; + out_buff[3] = 0; + + for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h) + { + int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch; + int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch; + + for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w); + ++ker_w, ker_idx += output_ch) + { + int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset; + out_buff[0] += in_val * (kernel[ker_idx + 0 + mult_tile] + filter_offset); + out_buff[1] += in_val * (kernel[ker_idx + 1 + mult_tile] + filter_offset); + out_buff[2] += in_val * (kernel[ker_idx + 2 + mult_tile] + filter_offset); + out_buff[3] += in_val * (kernel[ker_idx + 3 + mult_tile] + filter_offset); + } + } + + if (bias != NULL) + { + out_buff[0] += bias[out_ch + 0 + mult_tile]; + out_buff[1] += bias[out_ch + 1 + mult_tile]; + out_buff[2] += bias[out_ch + 2 + mult_tile]; + out_buff[3] += bias[out_ch + 3 + mult_tile]; + } + out_buff[0] = arm_nn_requantize(out_buff[0], output_mult, output_shift); + out_buff[1] = arm_nn_requantize(out_buff[1], output_mult, output_shift); + out_buff[2] = arm_nn_requantize(out_buff[2], output_mult, output_shift); + out_buff[3] = arm_nn_requantize(out_buff[3], output_mult, output_shift); + + out_buff[0] += output_offset; + out_buff[1] += output_offset; + out_buff[2] += output_offset; + out_buff[3] += output_offset; + + out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max); + out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max); + out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max); + out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max); + + output[out_idx++] = (uint8_t)out_buff[0]; + output[out_idx++] = (uint8_t)out_buff[1]; + output[out_idx++] = (uint8_t)out_buff[2]; + output[out_idx++] = (uint8_t)out_buff[3]; + } + } + } + } +} + +static void depthwise_conv_u8_generic(const uint8_t *input, + const int32_t input_x, + const int32_t input_y, + 
const int32_t input_ch, + const uint8_t *kernel, + const int32_t output_ch, + const int32_t ch_mult, + const int32_t kernel_x, + const int32_t kernel_y, + const int32_t pad_x, + const int32_t pad_y, + const int32_t stride_x, + const int32_t stride_y, + const int32_t *bias, + uint8_t *output, + const int32_t output_shift, + const int32_t output_mult, + const int32_t output_x, + const int32_t output_y, + const int32_t output_offset, + const int32_t input_offset, + const int32_t filter_offset, + const int32_t output_activation_min, + const int32_t output_activation_max) +{ + (void)output_ch; + int i_out = 0; + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + { + const int16_t base_idx_y = (i_out_y * stride_y) - pad_y; + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + { + const int16_t base_idx_x = (i_out_x * stride_x) - pad_x; + for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++) + { + for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++) + { + const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult; + int32_t acc_0; + /* Condition for kernel start dimension: (base_idx_<x,y> + ker_<x,y>_start) >= 0 */ + const int ker_y_start = MAX(0, -base_idx_y); + const int ker_x_start = MAX(0, -base_idx_x); + /* Condition for kernel end dimension: (base_idx_<x,y> + ker_<x,y>_end) < input_<x,y> */ + const int ker_y_end = MIN(kernel_y, input_y - base_idx_y); + const int ker_x_end = MIN(kernel_x, input_x - base_idx_x); + acc_0 = 0; + + for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++) + { + const int32_t idx_y = base_idx_y + i_ker_y; + for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++) + { + const int32_t idx_x = base_idx_x + i_ker_x; + int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch; + int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch; + + acc_0 += (input[idx_0] + input_offset) * (kernel[ker_idx_0] + filter_offset); + } + } + if (bias != NULL) + { + acc_0 += bias[idx_out_ch]; + } + + /* Requantize and clamp output to provided range */ + acc_0 = arm_nn_requantize(acc_0, output_mult, output_shift); + acc_0 += output_offset; + acc_0 = MAX(acc_0, output_activation_min); + acc_0 = MIN(acc_0, output_activation_max); + + output[i_out++] = acc_0; + } + } + } + } +} + +/** + * @brief uint8 depthwise convolution function with asymmetric quantization + * + * @param[in] input Pointer to input tensor + * @param[in] input_x Width of input tensor + * @param[in] input_y Height of input tensor + * @param[in] input_ch Channels in input tensor + * @param[in] kernel Pointer to kernel weights + * @param[in] kernel_x Width of kernel + * @param[in] kernel_y Height of kernel + * @param[in] ch_mult Number of channel multiplier + * @param[in] pad_x Padding sizes x + * @param[in] pad_y Padding sizes y + * @param[in] stride_x Convolution stride along the width + * @param[in] stride_y Convolution stride along the height + * @param[in] dilation_x Dilation along width. Not used and intended for future enhancement. + * @param[in] dilation_y Dilation along height. Not used and intended for future enhancement. + * @param[in] bias Pointer to optional bias values. 
If no bias is
+ *                                       available, NULL is expected
+ * @param[in]     input_offset          Input tensor zero offset
+ * @param[in]     filter_offset         Kernel tensor zero offset
+ * @param[in]     output_offset         Output tensor zero offset
+ * @param[in,out] output                Pointer to output tensor
+ * @param[in]     output_x              Width of output tensor
+ * @param[in]     output_y              Height of output tensor
+ * @param[in]     output_activation_min Minimum value to clamp the output to. Range : {0, 255}
+ * @param[in]     output_activation_max Maximum value to clamp the output to. Range : {0, 255}
+ * @param[in]     output_shift          Amount of right-shift for output
+ * @param[in]     output_mult           Output multiplier for requantization
+ * @return        The function returns one of the following
+ *                <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported tensor dimensions
+ *                <code>ARM_MATH_SUCCESS</code> - Successful operation
+ *                <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available
+ *
+ */
+
+arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
+                                            const uint16_t input_x,
+                                            const uint16_t input_y,
+                                            const uint16_t input_ch,
+                                            const uint8_t *kernel,
+                                            const uint16_t kernel_x,
+                                            const uint16_t kernel_y,
+                                            const int16_t ch_mult,
+                                            const int16_t pad_x,
+                                            const int16_t pad_y,
+                                            const int16_t stride_x,
+                                            const int16_t stride_y,
+                                            const int16_t dilation_x,
+                                            const int16_t dilation_y,
+                                            const int32_t *bias,
+                                            const int32_t input_offset,
+                                            const int32_t filter_offset,
+                                            const int32_t output_offset,
+                                            uint8_t *output,
+                                            const uint16_t output_x,
+                                            const uint16_t output_y,
+                                            const int32_t output_activation_min,
+                                            const int32_t output_activation_max,
+                                            const int32_t output_shift,
+                                            const int32_t output_mult)
+{
+    (void)dilation_x;
+    (void)dilation_y;
+
+    if (ch_mult % 4 == 0)
+    {
+        depthwise_conv_u8_mult_4(input,
+                                 input_x,
+                                 input_y,
+                                 input_ch,
+                                 kernel,
+                                 ch_mult * input_ch,
+                                 ch_mult,
+                                 kernel_x,
+                                 kernel_y,
+                                 pad_x,
+                                 pad_y,
+                                 stride_x,
+                                 stride_y,
+                                 bias,
+                                 output,
+                                 output_shift,
+                                 output_mult,
+                                 output_x,
+                                 output_y,
+                                 output_offset,
+                                 input_offset,
+                                 filter_offset,
+                                 output_activation_min,
+                                 output_activation_max);
+    }
+    else
+    {
+        depthwise_conv_u8_generic(input,
+                                  input_x,
+                                  input_y,
+                                  input_ch,
+                                  kernel,
+                                  ch_mult * input_ch,
+                                  ch_mult,
+                                  kernel_x,
+                                  kernel_y,
+                                  pad_x,
+                                  pad_y,
+                                  stride_x,
+                                  stride_y,
+                                  bias,
+                                  output,
+                                  output_shift,
+                                  output_mult,
+                                  output_x,
+                                  output_y,
+                                  output_offset,
+                                  input_offset,
+                                  filter_offset,
+                                  output_activation_min,
+                                  output_activation_max);
+    }
+
+    /* Return to application */
+    return ARM_MATH_SUCCESS;
+}
+
+/**
+ * @} end of NNConv group
+ */
diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c
new file mode 100644
index 0000000..23c8e46
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_conv_wrapper_s8.c + * Description: Wrapper API to select appropriate depthwise conv API based + * on dimensions. + * + * $Date: 20. Dec 2021 + * $Revision: V.1.4.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/* + * s8 Depthwise conv wrapper function + * + * Refer header file for details. + * + */ +arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx, + const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const q7_t *input, + const cmsis_nn_dims *filter_dims, + const q7_t *filter, + const cmsis_nn_dims *bias_dims, + const int32_t *bias, + const cmsis_nn_dims *output_dims, + q7_t *output) +{ + arm_status status = ARM_MATH_SUCCESS; + if (1 == dw_conv_params->ch_mult && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { +#if !defined(ARM_MATH_MVEI) + if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1) && + (dw_conv_params->padding.w <= 1)) + { + status = arm_depthwise_conv_3x3_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + else +#endif + { + status = arm_depthwise_conv_s8_opt(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + } + else + { + status = arm_depthwise_conv_s8(ctx, + dw_conv_params, + quant_params, + input_dims, + input, + filter_dims, + filter, + bias_dims, + bias, + output_dims, + output); + } + + /* Return to application */ + return status; +} + +int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params, + const cmsis_nn_dims *input_dims, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims) +{ + (void)dw_conv_params; + int32_t size = 0; + + if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 && + dw_conv_params->dilation.h == 1) + { + size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims); + } + + return size; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c new file mode 100644 index 0000000..729147f --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c @@ -0,0 +1,422 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_separable_conv_HWC_q7.c + * Description: Q7 depthwise separable convolution function + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimension + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. + * + * @details + * + * <b>Buffer size:</b> + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * bufferB size: 0 + * + * <b>Input dimension constraints:</b> + * + * ch_im_in equals ch_im_out + * + * Implementation: + * There are 3 nested loop here: + * Inner loop: calculate each output value with MAC instruction over an accumulator + * Mid loop: loop over different output channel + * Outer loop: loop over different output (x, y) + */ + +arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out, + q15_t *bufferA, + q7_t *bufferB) +{ + (void)bufferB; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_out_y, i_out_x; + int16_t i_ker_y, i_ker_x; + q7_t *colBuffer = (q7_t *)bufferA; + q7_t *pBuffer = colBuffer; + const q7_t *pBias = bias; + q7_t *pOut = Im_out; + uint16_t rowCnt; + uint16_t row_shift; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + /* we first do im2col here */ + for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) + { + for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) + { + /* arm_fill_q7(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, ch_im_in); + } + else + { + /* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); + */ + memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * 
dim_im_in + i_ker_x) * ch_im_in, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* we will do the computation here for each channel */ + rowCnt = ch_im_out >> 2; + row_shift = 0; + pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = (dim_kernel * dim_kernel) >> 1; + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + row_shift += 4; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHTB(opB, inB1, 16); + inB1 = __PKHBT(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHTB(opB, inA1, 16); + inA1 = __PKHBT(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum3 = __SMLAD(opA, opB, sum3); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum4 = __SMLAD(opA, opB, sum4); + colCnt--; + } +#else + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHBT(opB, inB1, 16); + inB1 = __PKHTB(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHBT(opB, inA1, 16); + inA1 = __PKHTB(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum4 = __SMLAD(opA, opB, sum4); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum3 = __SMLAD(opA, opB, sum3); + colCnt--; + } + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + +#ifndef ARM_MATH_BIG_ENDIAN + /* + * r0 r1 r2 r3 r4 r5 + * inA1, inA2, inB1, inB2, opA, opB + */ + + asm volatile("COL_LOOP_%=:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhtb r3, r5, r2, ASR #16\n" + "pkhbt r2, r2, r5, LSL #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhtb r1, r5, r0, ASR #16\n" + "pkhbt r0, r0, r5, LSL #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum], r4, r5, %[sum]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#else + /* + * r0 r1 r2 r3 r4 r5 
+ * inA1, inA2, inB1, inB2, opA, opB + */ + asm volatile("COL_LOOP_%=:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhbt r3, r5, r2, LSL #16\n" + "pkhtb r2, r2, r5, ASR #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhbt r1, r5, r0, LSL #16\n" + "pkhtb r0, r0, r5, ASR #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum], r4, r5, %[sum]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP_%=\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = (dim_kernel * dim_kernel) & 0x1; + while (colCnt) + { + union arm_nnword inA, inB; + inA.word = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inB.word = arm_nn_read_q7x4(pB); + pB += ch_im_in; + sum += inA.bytes[0] * inB.bytes[0]; + sum2 += inA.bytes[1] * inB.bytes[1]; + sum3 += inA.bytes[2] * inB.bytes[2]; + sum4 += inA.bytes[3] * inB.bytes[3]; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = ch_im_out & 0x3; + while (rowCnt) + { + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = (dim_kernel * dim_kernel); + + row_shift += 1; + + while (colCnt) + { + q7_t A1 = *pA; + q7_t B1 = *pB; + pA += ch_im_in; + pB += ch_im_in; + sum += A1 * B1; + + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + rowCnt--; + } + + /* clear counter and pointers */ + pBuffer = colBuffer; + } + } + +#else + (void)bufferA; + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y; + int conv_out; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) + { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) + { + // for each output + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++) + { + for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++) + { + int in_row = stride * i_out_y + i_ker_y - padding; + int in_col = stride * i_out_x + i_ker_x - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in) + { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application 
*/ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c new file mode 100644 index 0000000..829acf9 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7_nonsquare.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_depthwise_separable_conv_HWC_q7_nonsquare.c + * Description: Q7 depthwise separable convolution function (non-square shape) + * + * $Date: July 20, 2021 + * $Revision: V.1.1.2 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup NNConv + * @{ + */ + +/** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimension x + * @param[in] dim_im_in_y input tensor dimension y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] bufferB pointer to buffer space for output + * @return The function returns either + * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. 
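+ * + * Numerical sketch of the fixed-point convention used below (illustrative values, not fixed by the API): with bias_shift = 6 and out_shift = 9, each accumulator starts at (bias << 6) + NN_ROUND(9) = (bias << 6) + (1 << 8), so the final (sum >> 9) handed to __SSAT(..., 8) is rounded to nearest before saturating to q7.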
+ * + * This function is the version with the full list of optimization tricks, but with + * some constraints: + * ch_im_in is equal to ch_im_out + * + */ + +arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in, + const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, + const uint16_t ch_im_in, + const q7_t *wt, + const uint16_t ch_im_out, + const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, + const uint16_t padding_x, + const uint16_t padding_y, + const uint16_t stride_x, + const uint16_t stride_y, + const q7_t *bias, + const uint16_t bias_shift, + const uint16_t out_shift, + q7_t *Im_out, + const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, + q15_t *bufferA, + q7_t *bufferB) +{ + + (void)bufferB; + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + /* + * Implementation: + * There are 3 nested loops here: + * Inner loop: calculate each output value with MAC instruction over an accumulator + * Mid loop: loop over different output channels + * Outer loop: loop over different output (x, y) + * + */ + + int16_t i_out_y, i_out_x; + int16_t i_ker_y, i_ker_x; + q7_t *colBuffer = (q7_t *)bufferA; + q7_t *pBuffer = colBuffer; + const q7_t *pBias = bias; + q7_t *pOut = Im_out; + uint16_t rowCnt; + uint16_t row_shift; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + /* we first do im2col here */ + for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; + i_ker_y++) + { + for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; + i_ker_x++) + { + if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x) + { + /* arm_fill_q7(0, pBuffer, ch_im_in); */ + memset(pBuffer, 0, ch_im_in); + } + else + { + /* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, + * ch_im_in); */ + memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, ch_im_in); + } + pBuffer += ch_im_in; + } + } + + /* we will do the computation here for each channel */ + rowCnt = ch_im_out >> 2; + row_shift = 0; + pBias = bias; + + while (rowCnt) + { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1; + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + row_shift += 4; + +#ifdef USE_INTRINSIC + +#ifndef ARM_MATH_BIG_ENDIAN + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHTB(opB, inB1, 16); + inB1 = __PKHBT(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHTB(opB, inA1, 16); + inA1 = __PKHBT(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum3 = __SMLAD(opA,
opB, sum3); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum4 = __SMLAD(opA, opB, sum4); + colCnt--; + } +#else + + while (colCnt) + { + q31_t inA1, inA2, inB1, inB2, opA, opB; + + inB1 = arm_nn_read_q7x4(pB); + pB += ch_im_in; + opB = arm_nn_read_q7x4(pB); + pB += ch_im_in; + inB2 = __PKHBT(opB, inB1, 16); + inB1 = __PKHTB(inB1, opB, 16); + inA1 = arm_nn_read_q7x4(pA); + pA += ch_im_in; + opB = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inA2 = __PKHBT(opB, inA1, 16); + inA1 = __PKHTB(inA1, opB, 16); + opA = __SXTB16(inA1); + opB = __SXTB16(inB1); + sum2 = __SMLAD(opA, opB, sum2); + opA = __SXTB16(__ROR(inA1, 8)); + opB = __SXTB16(__ROR(inB1, 8)); + sum = __SMLAD(opA, opB, sum); + opA = __SXTB16(inA2); + opB = __SXTB16(inB2); + sum4 = __SMLAD(opA, opB, sum4); + opA = __SXTB16(__ROR(inA2, 8)); + opB = __SXTB16(__ROR(inB2, 8)); + sum3 = __SMLAD(opA, opB, sum3); + colCnt--; + } + +#endif /* ARM_MATH_BIG_ENDIAN */ + +#else + +#ifndef ARM_MATH_BIG_ENDIAN + // r0 r1 r2 r3 r4 r5 + // inA1, inA2, inB1, inB2, opA, opB + asm volatile("COL_LOOP:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhtb r3, r5, r2, ASR #16\n" + "pkhbt r2, r2, r5, LSL #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhtb r1, r5, r0, ASR #16\n" + "pkhbt r0, r0, r5, LSL #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum], r4, r5, %[sum]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#else + // r0 r1 r2 r3 r4 r5 + // inA1, inA2, inB1, inB2, opA, opB + asm volatile("COL_LOOP:\n" + "ldr.w r2, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "ldr.w r5, [%[pB], #0]\n" + "add.w %[pB], %[pB], %[ch_im_in]\n" + "pkhbt r3, r5, r2, LSL #16\n" + "pkhtb r2, r2, r5, ASR #16\n" + "ldr.w r0, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "ldr.w r5, [%[pA], #0]\n" + "add.w %[pA], %[pA], %[ch_im_in]\n" + "pkhbt r1, r5, r0, LSL #16\n" + "pkhtb r0, r0, r5, ASR #16\n" + "sxtb16 r4, r0\n" + "sxtb16 r5, r2\n" + "smlad %[sum2], r4, r5, %[sum2]\n" + "mov.w r4, r0, ror #8\n" + "mov.w r5, r2, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum], r4, r5, %[sum]\n" + "sxtb16 r4, r1\n" + "sxtb16 r5, r3\n" + "smlad %[sum4], r4, r5, %[sum4]\n" + "mov.w r4, r1, ror #8\n" + "mov.w r5, r3, ror #8\n" + "sxtb16 r4, r4\n" + "sxtb16 r5, r5\n" + "smlad %[sum3], r4, r5, %[sum3]\n" + "subs %[colCnt], #1\n" + "bne COL_LOOP\n" + : [ sum ] "+r"(sum), + [ sum2 ] "+r"(sum2), + [ sum3 ] "+r"(sum3), + [ sum4 ] "+r"(sum4), + [ pB ] "+r"(pB), + [ pA ] "+r"(pA) + : [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in) + : "r0", "r1", "r2", "r3", "r4", "r5"); +#endif /*ARM_MATH_BIG_ENDIAN */ + +#endif /* USE_INTRINSIC */ + + colCnt = (dim_kernel_x * dim_kernel_y) & 0x1; + while (colCnt) + { + union arm_nnword inA, inB; + inA.word = arm_nn_read_q7x4(pA); + pA += ch_im_in; + inB.word = 
arm_nn_read_q7x4(pB); + pB += ch_im_in; + sum += inA.bytes[0] * inB.bytes[0]; + sum2 += inA.bytes[1] * inB.bytes[1]; + sum3 += inA.bytes[2] * inB.bytes[2]; + sum4 += inA.bytes[3] * inB.bytes[3]; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = ch_im_out & 0x3; + while (rowCnt) + { + q7_t *pB = colBuffer + row_shift; + const q7_t *pA = wt + row_shift; + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = (dim_kernel_x * dim_kernel_y); + + row_shift += 1; + + while (colCnt) + { + q7_t A1 = *pA; + q7_t B1 = *pB; + pA += ch_im_in; + pB += ch_im_in; + sum += A1 * B1; + + colCnt--; + } + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + rowCnt--; + } + + // clear counter and pointers + pBuffer = colBuffer; + } + } + +#else + (void)bufferA; + + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int i_out_y, i_out_x, i_ch_out; + int i_ker_y, i_ker_x; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) + { + return ARM_MATH_SIZE_MISMATCH; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) + { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) + { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) + { + // for each output + int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++) + { + for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++) + { + int in_row = stride_y * i_out_y + i_ker_y - padding_y; + int in_col = stride_x * i_out_x + i_ker_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) + { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + +#endif /* ARM_MATH_DSP */ + + /* Return to application */ + return ARM_MATH_SUCCESS; +} + +/** + * @} end of NNConv group + */ diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c new file mode 100644 index 0000000..481eeba --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_depthwise_conv_s8_core.c + * Description: Depthwise convolution on im2col buffers. + * + * $Date: 09. 
October 2020 + * $Revision: V.1.0.4 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/* + * Depthwise conv on an im2col buffer where the input channel equals + * output channel. + * + * Refer header file for details. + * + */ + +q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row, + const q15_t *col, + const uint16_t num_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int32_t activation_min, + const int32_t activation_max, + const uint16_t kernel_size, + const int32_t *const output_bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + int32_t ch_per_loop = num_ch / 4; + + const int32_t *bias = output_bias; + int8_t *out_tmp = out; + + int32_t idx = 0; + + while (ch_per_loop > 0) + { + int32x4_t ip_0; + int32x4_t ip_1; + int32_t ker_loop = kernel_size / 3; + int32x4_t out_0 = vldrwq_s32(bias); + int32x4_t out_1 = out_0; + bias += 4; + + const int32_t offset = idx * 4; + const int8_t *row_0 = row + offset; + const int16_t *col_0 = col + offset; + const int16_t *col_1 = col + kernel_size * num_ch + offset; + + int32x4_t ker_0 = vldrbq_s32(row_0); + + while (ker_loop > 0) + { + const int8_t *row_1 = row_0 + num_ch; + const int8_t *row_2 = row_0 + 2 * num_ch; + const int32x4_t ker_1 = vldrbq_s32(row_1); + const int32x4_t ker_2 = vldrbq_s32(row_2); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_0); + out_1 += vmulq_s32(ip_1, ker_0); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_1); + out_1 += vmulq_s32(ip_1, ker_1); + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + col_0 += num_ch; + col_1 += num_ch; + + out_0 += vmulq_s32(ip_0, ker_2); + out_1 += vmulq_s32(ip_1, ker_2); + row_0 += 3 * num_ch; + + ker_0 = vldrbq_s32(row_0); + ker_loop--; + } + + idx++; + /* Handle tail kernel elements */ + ker_loop = kernel_size - ((kernel_size / 3) * 3); + while (ker_loop > 0) + { + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + + out_0 += vmulq_s32(ip_0, ker_0); + out_1 += vmulq_s32(ip_1, ker_0); + + col_0 += num_ch; + col_1 += num_ch; + + ip_0 = vldrhq_s32(col_0); + ip_1 = vldrhq_s32(col_1); + + row_0 += num_ch; + ker_0 = vldrbq_s32(row_0); + ker_loop--; + } + const int32x4_t mult = vldrwq_s32(out_mult); + const int32x4_t shift = vldrwq_s32(out_shift); + out_mult += 4; + out_shift += 4; + + out_0 = arm_requantize_mve_32x4(out_0, mult, shift); + out_1 = arm_requantize_mve_32x4(out_1, mult, shift); + + out_0 = vaddq_n_s32(out_0, out_offset); + out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min)); + out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max)); + vstrbq_s32(out_tmp, out_0); + + out_1 = vaddq_n_s32(out_1, out_offset); + out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min)); + out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max)); + vstrbq_s32(out_tmp + num_ch, out_1); + + out_tmp += 4; + ch_per_loop--; + } + + int32_t tail_ch = num_ch & 3; + if (tail_ch != 0) + { + int32_t ch_idx = (num_ch & ~3); + int32x4_t col_0_sum; + int32x4_t col_1_sum; + + const int32_t single_buffer_size = kernel_size * num_ch; + for (int i = 0; i < tail_ch; i++) + { + const int16_t *col_pos_0 = col + ch_idx; + const int16_t *col_pos_1 = col_pos_0 + single_buffer_size; + + const int8_t *row_pos = row + ch_idx; + int32_t sum_0 = bias[i]; + int32_t sum_1 = bias[i]; + + for (int j = 0; j < 
kernel_size; j++) + { + const int8_t row_val = row_pos[j * num_ch]; + sum_0 += row_val * col_pos_0[j * num_ch]; + sum_1 += row_val * col_pos_1[j * num_ch]; + } + col_0_sum[i] = sum_0; + col_1_sum[i] = sum_1; + + ch_idx++; + } + const mve_pred16_t p = vctp32q((uint32_t)tail_ch); + const int32x4_t mult = vldrwq_z_s32(out_mult, p); + const int32x4_t shift = vldrwq_z_s32(out_shift, p); + + col_0_sum = arm_requantize_mve_32x4(col_0_sum, mult, shift); + col_1_sum = arm_requantize_mve_32x4(col_1_sum, mult, shift); + + col_0_sum = vaddq_n_s32(col_0_sum, out_offset); + col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min)); + col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out_tmp, col_0_sum, p); + + col_1_sum = vaddq_n_s32(col_1_sum, out_offset); + col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min)); + col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max)); + vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p); + + out_tmp += tail_ch; + } + + return out_tmp + num_ch; +#else + (void)row; + (void)col; + (void)num_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)kernel_size; + (void)output_bias; + (void)out; + return NULL; +#endif +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c new file mode 100644 index 0000000..05c95b6 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_q7_q15.c + * Description: Matrix-multiplication function for convolution + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @brief Matrix-multiplication function for convolution. + * + * @details Refer to header file for details. 
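+ * + * In outline (an informal reading of the code below): the kernel walks two rows of A, i.e. two output channels, against two im2col columns at once, keeping four q31 accumulators live per iteration, and writes results through pOut and a second pointer pOut2 = pOut + ch_im_out so both output pixels stay channel-interleaved.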
+ * + */ + +q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut) +{ +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *pOut2 = pOut + ch_im_out; + const q7_t *pBias = bias; + + uint16_t rowCnt = ch_im_out >> 1; + /* this loop over rows in A */ + while (rowCnt) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* align the second pointer for A */ + const q7_t *pA2 = pA + numCol_A; + + /* init the sum with bias */ + q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA11, inA12, inA21, inA22; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad(pA, &inA11, &inA12); + pA2 = read_and_pad(pA2, &inA21, &inA22); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + sum3 = __SMLAD(inA21, inB1, sum3); + sum4 = __SMLAD(inA21, inB2, sum4); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + sum3 = __SMLAD(inA22, inB1, sum3); + sum4 = __SMLAD(inA22, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q7_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + /* skip the row computed with A2 */ + pA += numCol_A; + rowCnt--; + } /* for over ch_im_out */ + + /* compute left-over row if any */ + if (ch_im_out & 0x1) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* load the bias */ + q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + while (colCnt) + { + q31_t inA11, inA12; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad(pA, &inA11, &inA12); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + + colCnt--; + } + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 += inA1 * inB2; + colCnt--; + } + + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + } + + pOut += ch_im_out; + + /* return the new output pointer with offset */ + return pOut; +#else + (void)pA; + (void)pInBuffer; + (void)ch_im_out; + (void)numCol_A; + (void)bias_shift; + (void)out_shift; + (void)bias; + (void)pOut; + /* To be completed */ + 
return NULL; +#endif /* ARM_MATH_DSP */ +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c new file mode 100644 index 0000000..0870ac3 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_q7_q15_reordered.c + * Description: Matrix-multiplication function for convolution with reordered columns + * + * $Date: January 26, 2021 + * $Revision: V.1.0.2 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @brief Matrix-multiplication function for convolution with re-ordered input. + * + * @details Refer to header file for details. + * + */ + +q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, + const q15_t *pInBuffer, + const uint16_t ch_im_out, + const uint16_t numCol_A, + const uint16_t bias_shift, + const uint16_t out_shift, + const q7_t *bias, + q7_t *pOut) +{ + +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *pOut2 = pOut + ch_im_out; + int i; + + /* this loop over rows in A */ + for (i = 0; i < ch_im_out; i += 2) + { + /* setup pointers for B */ + const q15_t *pB = pInBuffer; + const q15_t *pB2 = pB + numCol_A; + + /* align the second pointer for A */ + const q7_t *pA2 = pA + numCol_A; + + /* init the sum with bias */ + q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = numCol_A >> 2; + /* accumulate over the vector */ + while (colCnt) + { + q31_t inA11, inA12, inA21, inA22; + + q31_t inB1 = arm_nn_read_q15x2_ia(&pB); + q31_t inB2 = arm_nn_read_q15x2_ia(&pB2); + + pA = read_and_pad_reordered(pA, &inA11, &inA12); + pA2 = read_and_pad_reordered(pA2, &inA21, &inA22); + + sum = __SMLAD(inA11, inB1, sum); + sum2 = __SMLAD(inA11, inB2, sum2); + sum3 = __SMLAD(inA21, inB1, sum3); + sum4 = __SMLAD(inA21, inB2, sum4); + + inB1 = arm_nn_read_q15x2_ia(&pB); + inB2 = arm_nn_read_q15x2_ia(&pB2); + + sum = __SMLAD(inA12, inB1, sum); + sum2 = __SMLAD(inA12, inB2, sum2); + sum3 = __SMLAD(inA22, inB1, sum3); + sum4 = __SMLAD(inA22, inB2, sum4); + + colCnt--; + } /* while over colCnt */ + colCnt = numCol_A & 0x3; + while (colCnt) + { + q7_t inA1 = *pA++; + q15_t inB1 = *pB++; + q7_t inA2 = *pA2++; + q15_t inB2 = *pB2++; + + sum += inA1 * inB1; + sum2 
+= inA1 * inB2; + sum3 += inA2 * inB1; + sum4 += inA2 * inB2; + colCnt--; + } /* while over colCnt */ + *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + /* skip the row computed with A2 */ + pA += numCol_A; + } /* for over ch_im_out */ + + pOut += ch_im_out; + + /* return the new output pointer with offset */ + return pOut; +#else + (void)pA; + (void)pInBuffer; + (void)ch_im_out; + (void)numCol_A; + (void)bias_shift; + (void)out_shift; + (void)bias; + (void)pOut; + /* To be completed */ + return NULL; +#endif /* ARM_MATH_DSP */ +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c new file mode 100644 index 0000000..cb30068 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_s8_s16.c + * Description: Matrix-multiplication function for convolution + * + * $Date: 14. December 2021 + * $Revision: V.1.1.0 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/* + * Matrix-multiplication function for convolution with per-channel requantization. + * + * Refer header file for details. 
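+ * + * In informal pseudo-code mirroring the loop below: acc = bias[ch]; acc += sum_i(a[i] * b[i]); out = clamp(arm_nn_requantize(acc, out_mult[ch], out_shift[ch]) + out_offset, activation_min, activation_max).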
+ * + */ + +q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0) +{ +#if !defined(ARM_MATH_MVEI) + /* set up the second output pointers */ + q7_t *out_1 = out_0 + output_ch; + const int32_t *bias = output_bias; + + uint16_t row_count = output_ch / 2; + const q7_t *ip_a0 = input_a; + /* this loop over rows in A */ + while (row_count) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* align the second pointer for A */ + const q7_t *ip_a1 = ip_a0 + num_col_a; + + q31_t ch_0_out_0 = 0; + q31_t ch_0_out_1 = 0; + q31_t ch_1_out_0 = 0; + q31_t ch_1_out_1 = 0; + /* Init accumulator with bias for channel N and N + 1 */ + if (bias) + { + ch_0_out_0 = *bias; + ch_0_out_1 = *bias++; + ch_1_out_0 = *bias; + ch_1_out_1 = *bias++; + } + +#if defined(ARM_MATH_DSP) + uint16_t col_count = num_col_a / 4; + /* accumulate over the vector */ + while (col_count) + { + q31_t a01, a02, a11, a12; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad(ip_a0, &a01, &a02); + ip_a1 = read_and_pad(ip_a1, &a11, &a12); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + + col_count--; + } /* while over col_count */ + col_count = num_col_a & 0x3; +#else + uint16_t col_count = num_col_a; +#endif + while (col_count) + { + q7_t a0 = *ip_a0++; + q15_t b0 = *ip_b0++; + q7_t a1 = *ip_a1++; + q15_t b1 = *ip_b1++; + + ch_0_out_0 += a0 * b0; + ch_0_out_1 += a0 * b1; + ch_1_out_0 += a1 * b0; + ch_1_out_1 += a1 * b1; + col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + + ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift); + ch_1_out_0 += out_offset; + ch_1_out_0 = MAX(ch_1_out_0, activation_min); + ch_1_out_0 = MIN(ch_1_out_0, activation_max); + *out_0++ = (q7_t)ch_1_out_0; + + ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); + ch_1_out_1 += out_offset; + ch_1_out_1 = MAX(ch_1_out_1, activation_min); + ch_1_out_1 = MIN(ch_1_out_1, activation_max); + *out_1++ = (q7_t)ch_1_out_1; + out_mult++; + out_shift++; + + /* skip row */ + ip_a0 += num_col_a; + row_count--; + } + + /* compute the last odd numbered row if any */ + if (output_ch & 0x1) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + q31_t ch_0_out_0 = 0; + q31_t ch_0_out_1 = 0; + + /* load the bias */ + if (bias) + { + ch_0_out_0 = *bias; + 
ch_0_out_1 = *bias++; + } + +#if defined(ARM_MATH_DSP) + uint16_t col_count = num_col_a >> 2; + while (col_count) + { + q31_t a01, a02; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad(ip_a0, &a01, &a02); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + + col_count--; + } + col_count = num_col_a & 0x3; +#else + uint16_t col_count = num_col_a; +#endif + while (col_count) + { + q7_t a0 = *ip_a0++; + q15_t b0 = *ip_b0++; + q15_t b1 = *ip_b1++; + + ch_0_out_0 += a0 * b0; + ch_0_out_1 += a0 * b1; + col_count--; + } + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + } + + out_0 += output_ch; + + /* return the new output pointer with offset */ + return out_0; +#else + (void)input_a; + (void)input_b; + (void)output_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)num_col_a; + (void)output_bias; + (void)out_0; + /* To be completed */ + return NULL; +#endif +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c new file mode 100644 index 0000000..842a180 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_kernel_s8_s16_reordered.c + * Description: Matrix-multiplication function for convolution with reordered columns + * + * $Date: 09. October 2020 + * $Revision: V.1.0.3 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/* + * Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel + * requantization. The re-ordering is a consequence of how sign extension is done by the SXTB16 command. + * + * Refer header file for details. 
This function differs from arm_nn_mat_mult_kernel_s8_s16() in that it uses + read_and_pad_reordered() instead of read_and_pad(). Investigating the cycles impact and + unifying these two functions is a potential future improvement. + * + */ + +q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a, + const q15_t *input_b, + const uint16_t output_ch, + const int32_t *out_shift, + const int32_t *out_mult, + const int32_t out_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t num_col_a, + const int32_t *const output_bias, + q7_t *out_0) +{ +#if defined(ARM_MATH_DSP) + /* set up the second output pointers */ + q7_t *out_1 = out_0 + output_ch; + const int32_t *bias = output_bias; + + uint16_t row_count = output_ch / 2; + const q7_t *ip_a0 = input_a; + /* this loop over rows in A */ + while (row_count) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* align the second pointer for A */ + const q7_t *ip_a1 = ip_a0 + num_col_a; + + /* Init accumulator with bias for channel N and N + 1 */ + q31_t ch_0_out_0 = *bias; + q31_t ch_0_out_1 = *bias++; + q31_t ch_1_out_0 = *bias; + q31_t ch_1_out_1 = *bias++; + + uint16_t col_count = num_col_a / 4; + /* accumulate over the vector */ + while (col_count) + { + q31_t a01, a02, a11, a12; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02); + ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12); + + ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0); + ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1); + + col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + out_mult++; + out_shift++; + + ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift); + ch_1_out_0 += out_offset; + ch_1_out_0 = MAX(ch_1_out_0, activation_min); + ch_1_out_0 = MIN(ch_1_out_0, activation_max); + *out_0++ = (q7_t)ch_1_out_0; + + ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift); + ch_1_out_1 += out_offset; + ch_1_out_1 = MAX(ch_1_out_1, activation_min); + ch_1_out_1 = MIN(ch_1_out_1, activation_max); + *out_1++ = (q7_t)ch_1_out_1; + out_mult++; + out_shift++; + + /* skip row */ + ip_a0 += num_col_a; + row_count--; + } + + if (output_ch & 1) + { + /* setup pointers for B */ + const q15_t *ip_b0 = input_b; + const q15_t *ip_b1 = ip_b0 + num_col_a; + + /* Init accumulator with bias for channel N + 1 */ + q31_t ch_0_out_0 = *bias; + q31_t ch_0_out_1 = ch_0_out_0; + + int32_t col_count = num_col_a / 4; + while (col_count) + { + q31_t a01, a02; + q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0); + q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02); + + 
ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1); + + b0 = arm_nn_read_q15x2_ia(&ip_b0); + b1 = arm_nn_read_q15x2_ia(&ip_b1); + + ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0); + ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1); + + col_count--; + } /* while over col_count */ + + ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift); + ch_0_out_0 += out_offset; + ch_0_out_0 = MAX(ch_0_out_0, activation_min); + ch_0_out_0 = MIN(ch_0_out_0, activation_max); + *out_0++ = (q7_t)ch_0_out_0; + + ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift); + ch_0_out_1 += out_offset; + ch_0_out_1 = MAX(ch_0_out_1, activation_min); + ch_0_out_1 = MIN(ch_0_out_1, activation_max); + *out_1++ = (q7_t)ch_0_out_1; + } + + out_0 += output_ch; + + /* return the new output pointer with offset */ + return out_0; +#else + (void)input_a; + (void)input_b; + (void)output_ch; + (void)out_shift; + (void)out_mult; + (void)out_offset; + (void)activation_min; + (void)activation_max; + (void)num_col_a; + (void)output_bias; + (void)out_0; + /* To be completed */ + return NULL; +#endif +} diff --git a/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c new file mode 100644 index 0000000..adfa702 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_nn_mat_mult_s8.c + * Description: General Matrix-multiplication function + * + * $Date: 27. October 2021 + * $Revision: V.2.0.6 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ + +#include "arm_nnsupportfunctions.h" + +/* + * s8 General matrix multiplication function with per-channel requantization for up to 4 column batches. + * + * Refer header file for details. 
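+ * + * Tail handling (informal note): each row loop runs (row_len + 7) / 8 iterations and the predicate vctp16q(row_len_tmp) masks lanes past the end of the row, so row_len need not be a multiple of 8.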
+ * + */ + +q7_t *arm_nn_mat_mult_s8(const q7_t *input_row, + const q7_t *input_col, + const uint16_t output_ch, + const uint16_t col_batches, + const int32_t *output_shift, + const int32_t *output_mult, + const int32_t out_offset, + const int32_t col_offset, + const int32_t row_offset, + const int16_t activation_min, + const int16_t activation_max, + const uint16_t row_len, + const int32_t *const bias, + q7_t *out) +{ +#if defined(ARM_MATH_MVEI) + (void)row_offset; + if (col_batches == 4) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t row_len_tmp = row_len; + const int8_t *ip_r0 = input_row + (i_out_ch * row_len); + const int8_t *ip_c0 = input_col; + const int8_t *ip_c1 = input_col + row_len; + const int8_t *ip_c2 = input_col + (2 * row_len); + const int8_t *ip_c3 = input_col + (3 * row_len); + + int32_t acc_0 = 0; + int32_t acc_1 = 0; + int32_t acc_2 = 0; + int32_t acc_3 = 0; + const int32_t row_loop_cnt = (row_len + 7) / 8; + + for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++) + { + mve_pred16_t p = vctp16q((uint32_t)row_len_tmp); + const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p); + row_len_tmp -= 8; + + int16x8_t c0 = vldrbq_s16(ip_c0); + ip_c0 += 8; + c0 = vaddq_s16(c0, offset); + + int16x8_t c1 = vldrbq_s16(ip_c1); + ip_c1 += 8; + c1 = vaddq_s16(c1, offset); + + int16x8_t c2 = vldrbq_s16(ip_c2); + ip_c2 += 8; + c2 = vaddq_s16(c2, offset); + + int16x8_t c3 = vldrbq_s16(ip_c3); + ip_c3 += 8; + c3 = vaddq_s16(c3, offset); + + int16x8_t r0 = vldrbq_z_s16(ip_r0, p); + ip_r0 += 8; + + acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p); + acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p); + acc_2 = vmladavaq_p_s16(acc_2, r0, c2, p); + acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p); + } + + int32x4_t res = {acc_0, acc_1, acc_2, acc_3}; + if (bias) + { + res = vaddq_n_s32(res, bias[i_out_ch]); + } + res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]); + res = vaddq_n_s32(res, out_offset); + + res = vmaxq_s32(res, vdupq_n_s32(activation_min)); + res = vminq_s32(res, vdupq_n_s32(activation_max)); + + const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3}; + vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res); + } + out += 4 * output_ch; + } + else + { + for (int i_col_batch = (col_batches & ~0x3); i_col_batch < (col_batches & 0x3); i_col_batch++) + { + for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++) + { + int32_t row_len_tmp = row_len; + + const int8_t *ip_r0 = input_row + (i_out_ch * row_len); + const int8_t *ip_c0 = input_col + (i_col_batch * row_len); + int32_t acc_0 = 0; + const int32_t row_loop_cnt = (row_len + 7) / 8; + + for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++) + { + const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp); + const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p); + row_len_tmp -= 8; + + int16x8_t c0 = vldrbq_s16(ip_c0); + ip_c0 += 8; + c0 = vaddq_s16(c0, offset); + + int16x8_t r0 = vldrbq_z_s16(ip_r0, p); + ip_r0 += 8; + acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p); + } + + if (bias) + { + acc_0 += bias[i_out_ch]; + } + acc_0 = arm_nn_requantize(acc_0, output_mult[i_out_ch], output_shift[i_out_ch]); + acc_0 += out_offset; + acc_0 = MAX(acc_0, activation_min); + acc_0 = MIN(acc_0, activation_max); + out[i_out_ch] = (q7_t)acc_0; + } + out += output_ch; + } + } + return out; + +#else + (void)input_row; + (void)input_col; + (void)output_ch; + (void)col_batches; + (void)output_shift; + (void)output_mult; + 
(void)out_offset; + (void)col_offset; + (void)row_offset; + (void)activation_min; + (void)activation_max; + (void)row_len; + (void)bias; + (void)out; + return NULL; +#endif +}
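All of the s8 kernels above funnel their q31 accumulators through arm_nn_requantize(). A minimal C reference model of that arithmetic is sketched below; the helper name requantize_ref is ours, and the saturation and negative-value rounding corrections of the real support functions are deliberately omitted, so treat it as an illustration rather than a drop-in replacement.

#include <stdint.h>

/* Illustrative model: scale a q31 accumulator by a q31 multiplier with a
 * doubling high multiply, then apply a rounding right shift. A positive
 * 'shift' is applied before the multiply, a negative one after, matching
 * the LEFT_SHIFT/RIGHT_SHIFT split used by the library. */
static int32_t requantize_ref(int32_t acc, int32_t mult, int32_t shift)
{
    const int32_t left = (shift > 0) ? shift : 0;
    const int32_t right = (shift > 0) ? 0 : -shift;

    /* Doubling high multiply with rounding: add 2^30 and take bits [62:31]
     * of the 64-bit product, i.e. round acc * mult / 2^31 to nearest. */
    const int64_t prod = ((int64_t)acc << left) * (int64_t)mult;
    int32_t res = (int32_t)((prod + (1LL << 30)) >> 31);

    /* Rounding divide by 2^right. */
    if (right > 0)
    {
        res = (res + (1 << (right - 1))) >> right;
    }
    return res;
}

/* Usage mirrors the kernels above (ch is a per-channel index):
 *   acc = requantize_ref(acc, out_mult[ch], out_shift[ch]) + out_offset;
 *   acc = MAX(acc, activation_min);
 *   acc = MIN(acc, activation_max);
 *   *out++ = (q7_t)acc;
 */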