From 5b81bc8ccbd342b8566d88fc9f17a73aec03b5b6 Mon Sep 17 00:00:00 2001 From: Clyne Sullivan Date: Wed, 29 Jan 2025 21:34:25 -0500 Subject: initial commit --- .../NN/Source/PoolingFunctions/CMakeLists.txt | 24 ++ .../NN/Source/PoolingFunctions/arm_avgpool_s16.c | 128 ++++++ .../NN/Source/PoolingFunctions/arm_avgpool_s8.c | 401 ++++++++++++++++++ .../NN/Source/PoolingFunctions/arm_max_pool_s16.c | 180 ++++++++ .../NN/Source/PoolingFunctions/arm_max_pool_s8.c | 229 ++++++++++ .../NN/Source/PoolingFunctions/arm_pool_q7_HWC.c | 464 +++++++++++++++++++++ 6 files changed, 1426 insertions(+) create mode 100644 Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt create mode 100644 Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c create mode 100644 Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c create mode 100644 Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c create mode 100644 Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c create mode 100644 Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c (limited to 'Drivers/CMSIS/NN/Source/PoolingFunctions') diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt new file mode 100644 index 0000000..a37503b --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2019-2022 Arm Limited. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +file(GLOB SRC "./*_s8.c") +file(GLOB SRC_S16 "./*_s16.c") +target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16}) + + + diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c new file mode 100644 index 0000000..5cd2b1c --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_avgpool_s16.c + * Description: Pooling function implementations + * + * $Date: 3. February 2022 + * $Revision: V.1.0.1 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * s16 average pooling function + * + * Refer to header file for details. + * + */ +arm_status arm_avgpool_s16(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q15_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q15_t *dst) +{ + (void)ctx; + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t ch_src = input_dims->c; + + /* Reference C code adapted from CMSIS-NN arm_avgpool_s8.c. + */ + + for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) + { + for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: (base_idx_ + kernel__start) >= 0 */ + const int32_t ker_y_start = MAX(0, -base_idx_y); + const int32_t ker_x_start = MAX(0, -base_idx_x); + + /* Condition for kernel end dimension: (base_idx_ + kernel__end) < dim_src_ */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x); + + for (int i_ch_in = 0; i_ch_in < ch_src; i_ch_in++) + { + int sum = 0; + int count = 0; + + for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) + { + sum += src[i_ch_in + ch_src * (k_x + base_idx_x + (k_y + base_idx_y) * input_x)]; + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum; + } + } + } + + return ARM_MATH_SUCCESS; +} + +int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src) +{ + (void)output_x; + (void)ch_src; + return 0; +} + +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c new file mode 100644 index 0000000..3e9861e --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_avgpool_s8.c + * Description: Pooling function implementations + * + * $Date: 01. March 2021 + * $Revision: V.2.0.4 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + +static void scale_q31_to_q7_and_clamp(const q31_t *buffer, + q7_t *target, + int32_t length, + const int32_t count, + const int act_min, + const int act_max) +{ + const int half_count = count / 2; + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return; + } + + for (int i = 0; i < length; i++) + { + int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count); + sum = sum / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + target[i] = (q7_t)sum; + } +} +#endif + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * s8 average pooling function + * + * Refer to header file for details. + * + */ + +#if defined(ARM_MATH_MVEI) + +arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + (void)ctx; + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t ch_src = input_dims->c; + + int32_t i_x, i_y; + int32_t k_x, k_y; + + for (i_y = 0; i_y < output_y; i_y++) + { + for (i_x = 0; i_x < output_x; i_x++) + { + + int32_t k_y_start, k_y_end; + int32_t k_x_start, k_x_end; + int32_t chCnt; + const int8_t *pTmp, *pTmpInner; + int8_t *pDst; + + k_y_start = MAX(0, i_y * stride_y - pad_y); + k_y_end = MIN(i_y * stride_y - pad_y + kernel_y, input_y); + + k_x_start = MAX(0, i_x * stride_x - pad_x); + k_x_end = MIN(i_x * stride_x - pad_x + kernel_x, input_x); + + pTmp = src; + pDst = &dst[ch_src * (i_x + i_y * output_x)]; + + chCnt = ch_src >> 4; + while (chCnt > 0) + { + int32x4_t sumV1, sumV2, sumV3, sumV4; + + int8x16_t tempV; + int16x8_t tempVLO, tempVHI; + int32x4_t tempVLOLO, tempVLOHI, tempVHILO, tempVHIHI; + int32_t count = 0; + + sumV1 = vdupq_n_s32(0); + sumV2 = vdupq_n_s32(0); + sumV3 = vdupq_n_s32(0); + sumV4 = vdupq_n_s32(0); + + for (k_y = k_y_start; k_y < k_y_end; k_y++) + { + for (k_x = k_x_start; k_x < k_x_end; k_x++) + { + pTmpInner = pTmp + (ch_src * (k_x + k_y * input_x)); + tempV = vldrbq_s8(pTmpInner); + + tempVLO = vmovlbq_s8(tempV); + tempVHI = vmovltq_s8(tempV); + + tempVLOLO = vmovlbq_s16(tempVLO); + tempVLOHI = vmovltq_s16(tempVLO); + + tempVHILO = vmovlbq_s16(tempVHI); + tempVHIHI = vmovltq_s16(tempVHI); + + sumV1 = vaddq_s32(sumV1, tempVLOLO); + sumV2 = vaddq_s32(sumV2, tempVLOHI); + sumV3 = vaddq_s32(sumV3, tempVHILO); + sumV4 = vaddq_s32(sumV4, tempVHIHI); + + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sumV1[0] = sumV1[0] > 0 ? (sumV1[0] + count / 2) / count : (sumV1[0] - count / 2) / count; + sumV1[1] = sumV1[1] > 0 ? (sumV1[1] + count / 2) / count : (sumV1[1] - count / 2) / count; + sumV1[2] = sumV1[2] > 0 ? (sumV1[2] + count / 2) / count : (sumV1[2] - count / 2) / count; + sumV1[3] = sumV1[3] > 0 ? (sumV1[3] + count / 2) / count : (sumV1[3] - count / 2) / count; + + sumV2[0] = sumV2[0] > 0 ? (sumV2[0] + count / 2) / count : (sumV2[0] - count / 2) / count; + sumV2[1] = sumV2[1] > 0 ? (sumV2[1] + count / 2) / count : (sumV2[1] - count / 2) / count; + sumV2[2] = sumV2[2] > 0 ? (sumV2[2] + count / 2) / count : (sumV2[2] - count / 2) / count; + sumV2[3] = sumV2[3] > 0 ? (sumV2[3] + count / 2) / count : (sumV2[3] - count / 2) / count; + + sumV3[0] = sumV3[0] > 0 ? (sumV3[0] + count / 2) / count : (sumV3[0] - count / 2) / count; + sumV3[1] = sumV3[1] > 0 ? (sumV3[1] + count / 2) / count : (sumV3[1] - count / 2) / count; + sumV3[2] = sumV3[2] > 0 ? (sumV3[2] + count / 2) / count : (sumV3[2] - count / 2) / count; + sumV3[3] = sumV3[3] > 0 ? (sumV3[3] + count / 2) / count : (sumV3[3] - count / 2) / count; + + sumV4[0] = sumV4[0] > 0 ? (sumV4[0] + count / 2) / count : (sumV4[0] - count / 2) / count; + sumV4[1] = sumV4[1] > 0 ? (sumV4[1] + count / 2) / count : (sumV4[1] - count / 2) / count; + sumV4[2] = sumV4[2] > 0 ? (sumV4[2] + count / 2) / count : (sumV4[2] - count / 2) / count; + sumV4[3] = sumV4[3] > 0 ? (sumV4[3] + count / 2) / count : (sumV4[3] - count / 2) / count; + + sumV1 = vmaxq_s32(sumV1, vdupq_n_s32(act_min)); + sumV1 = vminq_s32(sumV1, vdupq_n_s32(act_max)); + + sumV2 = vmaxq_s32(sumV2, vdupq_n_s32(act_min)); + sumV2 = vminq_s32(sumV2, vdupq_n_s32(act_max)); + + sumV3 = vmaxq_s32(sumV3, vdupq_n_s32(act_min)); + sumV3 = vminq_s32(sumV3, vdupq_n_s32(act_max)); + + sumV4 = vmaxq_s32(sumV4, vdupq_n_s32(act_min)); + sumV4 = vminq_s32(sumV4, vdupq_n_s32(act_max)); + + tempVLO = vmovnbq_s32(tempVLO, sumV1); + tempVLO = vmovntq_s32(tempVLO, sumV2); + + tempVHI = vmovnbq_s32(tempVHI, sumV3); + tempVHI = vmovntq_s32(tempVHI, sumV4); + + tempV = vmovnbq_s16(tempV, tempVLO); + tempV = vmovntq_s16(tempV, tempVHI); + + vstrbq_s8(pDst, tempV); + pDst += 16; + + chCnt--; + pTmp += 16; + } + + chCnt = ch_src & 0xF; + while (chCnt > 0) + { + int32_t sum = 0; + int32_t count = 0; + + for (k_y = k_y_start; k_y < k_y_end; k_y++) + { + for (k_x = k_x_start; k_x < k_x_end; k_x++) + { + sum += pTmp[ch_src * (k_x + k_y * input_x)]; + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + *pDst++ = sum; + + chCnt--; + pTmp++; + } + } + } + return ARM_MATH_SUCCESS; +} + +#else +arm_status arm_avgpool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t ch_src = input_dims->c; + + if (ctx->buf == NULL && arm_avgpool_s8_get_buffer_size(output_dims->w, input_dims->c)) + { + return ARM_MATH_ARGUMENT_ERROR; + } + q31_t *buffer = (q31_t *)ctx->buf; + +#if defined(ARM_MATH_DSP) + + /* Run the following code for CPU's with DSP extension + */ + for (int i_y = 0, idx_y = -pad_y; i_y < output_y; idx_y += stride_y, i_y++) + { + for (int i_x = 0, idx_x = -pad_x; i_x < output_x; idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: + (base_idx_ + kernel__start) >= 0 */ + const int32_t kernel_y_start = MAX(0, -idx_y); + const int32_t kernel_x_start = MAX(0, -idx_x); + + /* Condition for kernel end dimension: + (base_idx_ + kernel__end) < dim_src_ */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - idx_x); + + int count = 0; + + for (int k_y = kernel_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++) + { + const q7_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x); + + if (count == 0) + { + for (int i = 0; i < ch_src; i++) + { + buffer[i] = start[i]; + } + } + else + { + for (int i = 0; i < ch_src; i++) + { + buffer[i] = __QADD(start[i], buffer[i]); + } + } + count++; + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + scale_q31_to_q7_and_clamp(buffer, dst, ch_src, count, act_min, act_max); + dst += ch_src; + } + } +#else + + /* Reference C code adapted from CMSIS-NN arm_avepool_q7_HWC. + */ + (void)buffer; + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_y = 0; i_y < output_y; i_y++) + { + for (i_x = 0; i_x < output_x; i_x++) + { + for (i_ch_in = 0; i_ch_in < ch_src; i_ch_in++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - pad_y; k_y < i_y * stride_y - pad_y + kernel_y; k_y++) + { + for (k_x = i_x * stride_x - pad_x; k_x < i_x * stride_x - pad_x + kernel_x; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < input_y && k_x < input_x) + { + sum += src[i_ch_in + ch_src * (k_x + k_y * input_x)]; + count++; + } + } + } + + // Prevent static code issue DIVIDE_BY_ZERO. + if (count == 0) + { + return ARM_MATH_ARGUMENT_ERROR; + } + + sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; + sum = MAX(sum, act_min); + sum = MIN(sum, act_max); + + dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum; + } + } + } + +#endif + return ARM_MATH_SUCCESS; +} + +#endif /* ARM_MATH_MVEI */ + +int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src) +{ + (void)output_x; + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + return (ch_src * sizeof(int32_t)); +#else + (void)ch_src; + return 0; +#endif +} +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c new file mode 100644 index 0000000..483f874 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2022 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_max_pool_s16.c + * Description: Pooling function implementations + * + * $Date: 24. January 2022 + * $Revision: V.1.0.0 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +static void compare_and_replace_if_larger(int16_t *base, const int16_t *target, int32_t length) +{ + q15_t *dst = base; + const q15_t *src = target; + union arm_nnword ref_max; + union arm_nnword comp_max; + int32_t cnt = length >> 1; + + while (cnt > 0l) + { + ref_max.word = arm_nn_read_q15x2(dst); + comp_max.word = arm_nn_read_q15x2_ia(&src); + + if (comp_max.half_words[0] > ref_max.half_words[0]) + { + ref_max.half_words[0] = comp_max.half_words[0]; + } + if (comp_max.half_words[1] > ref_max.half_words[1]) + { + ref_max.half_words[1] = comp_max.half_words[1]; + } + + arm_nn_write_q15x2_ia(&dst, ref_max.word); + + cnt--; + } + + if (length & 0x1) + { + if (*src > *dst) + { + *dst = *src; + } + } +} + +static void clamp_output(int16_t *source, int32_t length, const int16_t act_min, const int16_t act_max) +{ + union arm_nnword in; + int32_t cnt = length >> 1; + + while (cnt > 0l) + { + in.word = arm_nn_read_q15x2(source); + + in.half_words[0] = MAX(in.half_words[0], act_min); + in.half_words[0] = MIN(in.half_words[0], act_max); + in.half_words[1] = MAX(in.half_words[1], act_min); + in.half_words[1] = MIN(in.half_words[1], act_max); + + arm_nn_write_q15x2_ia(&source, in.word); + cnt--; + } + + if (length & 0x1) + { + int16_t comp = *source; + comp = MAX(comp, act_min); + comp = MIN(comp, act_max); + *source = comp; + } +} + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * Optimized s16 max pooling function + * + * Refer to header file for details. + * + */ + +arm_status arm_max_pool_s16(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const int16_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + int16_t *dst) +{ + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int16_t act_min = pool_params->activation.min; + const int16_t act_max = pool_params->activation.max; + const int32_t channel_in = input_dims->c; + (void)ctx; + int16_t *dst_base = dst; + + for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) + { + for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: (base_idx_ + kernel__start) >= 0 */ + const int32_t ker_y_start = MAX(0, -base_idx_y); + const int32_t ker_x_start = MAX(0, -base_idx_x); + + /* Condition for kernel end dimension: (base_idx_ + kernel__end) < dim_src_ */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x); + + int count = 0; + + for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) + { + const int16_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); + + if (count == 0) + { + memcpy(dst, start, channel_in * sizeof(int16_t)); + count++; + } + else + { + compare_and_replace_if_larger(dst, start, channel_in); + } + } + } + /* 'count' is expected to be non-zero here. */ + dst += channel_in; + } + } + + clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max); + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c new file mode 100644 index 0000000..4fbbc91 --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_max_pool_s8.c + * Description: Pooling function implementations + * + * $Date: 20. July 2021 + * $Revision: V.2.0.3 + * + * Target Processor: Cortex-M CPUs + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int32_t length) +{ +#if defined(ARM_MATH_MVEI) + int32_t loop_count = (length + 15) / 16; + for (int i = 0; i < loop_count; i++) + { + mve_pred16_t p = vctp8q((uint32_t)length); + const int8x16_t op_1 = vldrbq_z_s8(base, p); + const int8x16_t op_2 = vldrbq_z_s8(target, p); + const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p); + vstrbq_p_s8(base, max, p); + base += 16; + target += 16; + length -= 16; + } +#else + q7_t *dst = base; + const q7_t *src = target; + union arm_nnword ref_max; + union arm_nnword comp_max; + int32_t cnt = length >> 2; + + while (cnt > 0l) + { + ref_max.word = arm_nn_read_q7x4(dst); + comp_max.word = arm_nn_read_q7x4_ia(&src); + + if (comp_max.bytes[0] > ref_max.bytes[0]) + { + ref_max.bytes[0] = comp_max.bytes[0]; + } + if (comp_max.bytes[1] > ref_max.bytes[1]) + { + ref_max.bytes[1] = comp_max.bytes[1]; + } + if (comp_max.bytes[2] > ref_max.bytes[2]) + { + ref_max.bytes[2] = comp_max.bytes[2]; + } + if (comp_max.bytes[3] > ref_max.bytes[3]) + { + ref_max.bytes[3] = comp_max.bytes[3]; + } + + arm_nn_write_q7x4_ia(&dst, ref_max.word); + + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0l) + { + if (*src > *dst) + { + *dst = *src; + } + dst++; + src++; + cnt--; + } +#endif +} + +static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, const int32_t act_max) +{ +#if defined(ARM_MATH_MVEI) + int32_t loop_count = (length + 15) / 16; + for (int i = 0; i < loop_count; i++) + { + mve_pred16_t p = vctp8q((uint32_t)length); + length -= 16; + const int8x16_t src = vldrbq_z_s8(source, p); + const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p); + const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p); + int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p); + res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p); + vstrbq_p_s8(source, res, p); + source += 16; + } +#else + union arm_nnword in; + int32_t cnt = length >> 2; + + while (cnt > 0l) + { + in.word = arm_nn_read_q7x4(source); + + in.bytes[0] = MAX(in.bytes[0], act_min); + in.bytes[0] = MIN(in.bytes[0], act_max); + in.bytes[1] = MAX(in.bytes[1], act_min); + in.bytes[1] = MIN(in.bytes[1], act_max); + in.bytes[2] = MAX(in.bytes[2], act_min); + in.bytes[2] = MIN(in.bytes[2], act_max); + in.bytes[3] = MAX(in.bytes[3], act_min); + in.bytes[3] = MIN(in.bytes[3], act_max); + + arm_nn_write_q7x4_ia(&source, in.word); + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0l) + { + int32_t comp = *source; + comp = MAX(comp, act_min); + comp = MIN(comp, act_max); + *source++ = (int8_t)comp; + cnt--; + } +#endif +} + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/* + * Optimized s8 max pooling function + * + * Refer to header file for details. + * + */ + +arm_status arm_max_pool_s8(const cmsis_nn_context *ctx, + const cmsis_nn_pool_params *pool_params, + const cmsis_nn_dims *input_dims, + const q7_t *src, + const cmsis_nn_dims *filter_dims, + const cmsis_nn_dims *output_dims, + q7_t *dst) +{ + const int32_t input_y = input_dims->h; + const int32_t input_x = input_dims->w; + const int32_t output_y = output_dims->h; + const int32_t output_x = output_dims->w; + const int32_t stride_y = pool_params->stride.h; + const int32_t stride_x = pool_params->stride.w; + const int32_t kernel_y = filter_dims->h; + const int32_t kernel_x = filter_dims->w; + const int32_t pad_y = pool_params->padding.h; + const int32_t pad_x = pool_params->padding.w; + const int32_t act_min = pool_params->activation.min; + const int32_t act_max = pool_params->activation.max; + const int32_t channel_in = input_dims->c; + (void)ctx; + q7_t *dst_base = dst; + + for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++) + { + for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++) + { + /* Condition for kernel start dimension: (base_idx_ + kernel__start) >= 0 */ + const int32_t ker_y_start = MAX(0, -base_idx_y); + const int32_t ker_x_start = MAX(0, -base_idx_x); + + /* Condition for kernel end dimension: (base_idx_ + kernel__end) < dim_src_ */ + const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y); + const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x); + + int count = 0; + + for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++) + { + for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++) + { + const q7_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x); + + if (count == 0) + { + arm_memcpy_q7(dst, start, channel_in); + count++; + } + else + { + compare_and_replace_if_larger_q7(dst, start, channel_in); + } + } + } + /* 'count' is expected to be non-zero here. */ + dst += channel_in; + } + } + + clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max); + + return ARM_MATH_SUCCESS; +} + +/** + * @} end of Pooling group + */ diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c new file mode 100644 index 0000000..5a3b1af --- /dev/null +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c @@ -0,0 +1,464 @@ +/* + * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_pool_q7_HWC.c + * Description: Pooling function implementations + * + * $Date: 20. July 2021 + * $Revision: V.1.1.1 + * + * Target Processor: Cortex-M cores + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + +/** + * @brief A few utility functions used by pooling functions + * + * + */ + +static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale) +{ + int i; + + for (i = 0; i < length; i++) + { + target[i] = (q7_t)(buffer[i] / scale); + } +} + +static void compare_and_replace_if_larger_q7(q7_t *base, // base data + const q7_t *target, // compare target + const uint16_t length // data length +) +{ + q7_t *pIn = base; + const q7_t *pCom = target; + union arm_nnword in; + union arm_nnword com; + uint16_t cnt = length >> 2; + + while (cnt > 0u) + { + in.word = arm_nn_read_q7x4((const q7_t *)pIn); + com.word = arm_nn_read_q7x4_ia((const q7_t **)&pCom); + + // if version + if (com.bytes[0] > in.bytes[0]) + in.bytes[0] = com.bytes[0]; + if (com.bytes[1] > in.bytes[1]) + in.bytes[1] = com.bytes[1]; + if (com.bytes[2] > in.bytes[2]) + in.bytes[2] = com.bytes[2]; + if (com.bytes[3] > in.bytes[3]) + in.bytes[3] = com.bytes[3]; + + arm_nn_write_q7x4_ia(&pIn, in.word); + + cnt--; + } + + cnt = length & 0x3; + while (cnt > 0u) + { + if (*pCom > *pIn) + { + *pIn = *pCom; + } + pIn++; + pCom++; + cnt--; + } +} + +static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length) +{ + q15_t *pCnt = base; + q7_t *pV = target; + q31_t v1, v2, vo1, vo2; + uint16_t cnt = length >> 2; + q31_t in; + + while (cnt > 0u) + { + q31_t value = arm_nn_read_q7x4_ia((const q7_t **)&pV); + v1 = __SXTB16(__ROR(value, 8)); + v2 = __SXTB16(value); +#ifndef ARM_MATH_BIG_ENDIAN + + vo2 = __PKHTB(v1, v2, 16); + vo1 = __PKHBT(v2, v1, 16); + +#else + + vo1 = __PKHTB(v1, v2, 16); + vo2 = __PKHBT(v2, v1, 16); + +#endif + + in = arm_nn_read_q15x2(pCnt); + arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in)); + + in = arm_nn_read_q15x2(pCnt); + arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in)); + + cnt--; + } + cnt = length & 0x3; + while (cnt > 0u) + { + *pCnt++ += *pV++; + cnt--; + } +} + +#endif // ARM_MATH_DSP + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Pooling + * @{ + */ + +/** + * @brief Q7 max pooling function + * @param[in, out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA Not used + * @param[in,out] Im_out pointer to output tensor + * + * @details + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. + * + */ + +void arm_maxpool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out) +{ + (void)bufferA; +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + int16_t i_x, i_y; + + /* first does the pooling along x axis */ + for (i_y = 0; i_y < dim_im_in; i_y++) + { + + for (i_x = 0; i_x < dim_im_out; i_x++) + { + /* for each output pixel */ + q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; + q7_t *win_start; + q7_t *win_stop; + if (i_x * stride - padding < 0) + { + win_start = target; + } + else + { + win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; + } + + if (i_x * stride - padding + dim_kernel >= dim_im_in) + { + win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; + } + else + { + win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; + } + + /* first step is to copy over initial data */ + /* arm_copy_q7(win_start, target, ch_im_in); */ + memmove(target, win_start, ch_im_in); + + /* start the max operation from the second part */ + win_start += ch_im_in; + for (; win_start < win_stop; win_start += ch_im_in) + { + compare_and_replace_if_larger_q7(target, win_start, ch_im_in); + } + } + } + + /* then does the pooling along y axis */ + for (i_y = 0; i_y < dim_im_out; i_y++) + { + + /* for each output row */ + q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; + q7_t *row_start; + q7_t *row_end; + /* setting the starting row */ + if (i_y * stride - padding < 0) + { + row_start = Im_in; + } + else + { + row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; + } + /* setting the stopping row */ + if (i_y * stride - padding + dim_kernel >= dim_im_in) + { + row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; + } + else + { + row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; + } + + /* copy over the first row */ + /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */ + memmove(target, row_start, dim_im_out * ch_im_in); + + /* move over to next row */ + row_start += ch_im_in * dim_im_in; + + for (; row_start < row_end; row_start += dim_im_in * ch_im_in) + { + compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in); + } + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out; i_y++) + { + for (i_x = 0; i_x < dim_im_out; i_x++) + { + int max = -129; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) + { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) + { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) + { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; + } + } + } + +#endif /* ARM_MATH_DSP */ +} + +/** + * @brief Q7 average pooling function + * @param[in,out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * + * @details + * + * Buffer size: + * + * bufferA size: 2*dim_im_out*ch_im_in + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. + * + */ + +void arm_avepool_q7_HWC(q7_t *Im_in, + const uint16_t dim_im_in, + const uint16_t ch_im_in, + const uint16_t dim_kernel, + const uint16_t padding, + const uint16_t stride, + const uint16_t dim_im_out, + q7_t *bufferA, + q7_t *Im_out) +{ + +#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) + /* Run the following code for Cortex-M4 and Cortex-M7 */ + + q15_t *buffer = (q15_t *)bufferA; + int16_t i_x, i_y; + int16_t count = 0; + + /* first does the pooling along x axis */ + for (i_y = 0; i_y < dim_im_in; i_y++) + { + + for (i_x = 0; i_x < dim_im_out; i_x++) + { + /* for each output pixel */ + q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; + q7_t *win_start; + q7_t *win_stop; + if (i_x * stride - padding < 0) + { + win_start = target; + } + else + { + win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; + } + + if (i_x * stride - padding + dim_kernel >= dim_im_in) + { + win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; + } + else + { + win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; + } + + /* first step is to copy over initial data */ + arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in); + count = 1; + + /* start the max operation from the second part */ + win_start += ch_im_in; + for (; win_start < win_stop; win_start += ch_im_in) + { + accumulate_q7_to_q15(buffer, win_start, ch_im_in); + count++; + } + buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); + } + } + + /* then does the pooling along y axis */ + for (i_y = 0; i_y < dim_im_out; i_y++) + { + /* for each output row */ + q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; + q7_t *row_start; + q7_t *row_end; + /* setting the starting row */ + if (i_y * stride - padding < 0) + { + row_start = Im_in; + } + else + { + row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; + } + /* setting the stopping row */ + if (i_y * stride - padding + dim_kernel >= dim_im_in) + { + row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; + } + else + { + row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; + } + + /* copy over the first row */ + arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); + count = 1; + + /* move over to next row */ + row_start += ch_im_in * dim_im_in; + + for (; row_start < row_end; row_start += dim_im_in * ch_im_in) + { + accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); + count++; + } + buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count); + } + +#else + /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ + + (void)bufferA; + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) + { + for (i_y = 0; i_y < dim_im_out; i_y++) + { + for (i_x = 0; i_x < dim_im_out; i_x++) + { + int sum = 0; + int count = 0; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) + { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) + { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) + { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; + } + } + } + +#endif /* ARM_MATH_DSP */ +} + +/** + * @} end of Pooling group + */ -- cgit v1.2.3