diff options
Diffstat (limited to 'Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c')
-rw-r--r-- | Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c | 924 |
1 files changed, 460 insertions, 464 deletions
diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c index 5a3b1af..71fb771 100644 --- a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c +++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c @@ -1,464 +1,460 @@ -/* - * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Project: CMSIS NN Library - * Title: arm_pool_q7_HWC.c - * Description: Pooling function implementations - * - * $Date: 20. July 2021 - * $Revision: V.1.1.1 - * - * Target Processor: Cortex-M cores - * - * -------------------------------------------------------------------- */ - -#include "arm_nnfunctions.h" -#include "arm_nnsupportfunctions.h" - -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - -/** - * @brief A few utility functions used by pooling functions - * - * - */ - -static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale) -{ - int i; - - for (i = 0; i < length; i++) - { - target[i] = (q7_t)(buffer[i] / scale); - } -} - -static void compare_and_replace_if_larger_q7(q7_t *base, // base data - const q7_t *target, // compare target - const uint16_t length // data length -) -{ - q7_t *pIn = base; - const q7_t *pCom = target; - union arm_nnword in; - union arm_nnword com; - uint16_t cnt = length >> 2; - - while (cnt > 0u) - { - in.word = arm_nn_read_q7x4((const q7_t *)pIn); - com.word = arm_nn_read_q7x4_ia((const q7_t **)&pCom); - - // if version - if (com.bytes[0] > in.bytes[0]) - in.bytes[0] = com.bytes[0]; - if (com.bytes[1] > in.bytes[1]) - in.bytes[1] = com.bytes[1]; - if (com.bytes[2] > in.bytes[2]) - in.bytes[2] = com.bytes[2]; - if (com.bytes[3] > in.bytes[3]) - in.bytes[3] = com.bytes[3]; - - arm_nn_write_q7x4_ia(&pIn, in.word); - - cnt--; - } - - cnt = length & 0x3; - while (cnt > 0u) - { - if (*pCom > *pIn) - { - *pIn = *pCom; - } - pIn++; - pCom++; - cnt--; - } -} - -static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length) -{ - q15_t *pCnt = base; - q7_t *pV = target; - q31_t v1, v2, vo1, vo2; - uint16_t cnt = length >> 2; - q31_t in; - - while (cnt > 0u) - { - q31_t value = arm_nn_read_q7x4_ia((const q7_t **)&pV); - v1 = __SXTB16(__ROR(value, 8)); - v2 = __SXTB16(value); -#ifndef ARM_MATH_BIG_ENDIAN - - vo2 = __PKHTB(v1, v2, 16); - vo1 = __PKHBT(v2, v1, 16); - -#else - - vo1 = __PKHTB(v1, v2, 16); - vo2 = __PKHBT(v2, v1, 16); - -#endif - - in = arm_nn_read_q15x2(pCnt); - arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in)); - - in = arm_nn_read_q15x2(pCnt); - arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in)); - - cnt--; - } - cnt = length & 0x3; - while (cnt > 0u) - { - *pCnt++ += *pV++; - cnt--; - } -} - -#endif // ARM_MATH_DSP - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Pooling - * @{ - */ - -/** - * @brief Q7 max pooling function - * @param[in, out] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA Not used - * @param[in,out] Im_out pointer to output tensor - * - * @details - * - * The pooling function is implemented as split x-pooling then - * y-pooling. - * - * This pooling function is input-destructive. Input data is undefined - * after calling this function. - * - */ - -void arm_maxpool_q7_HWC(q7_t *Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t *bufferA, - q7_t *Im_out) -{ - (void)bufferA; -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - /* Run the following code for Cortex-M4 and Cortex-M7 */ - - int16_t i_x, i_y; - - /* first does the pooling along x axis */ - for (i_y = 0; i_y < dim_im_in; i_y++) - { - - for (i_x = 0; i_x < dim_im_out; i_x++) - { - /* for each output pixel */ - q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; - q7_t *win_start; - q7_t *win_stop; - if (i_x * stride - padding < 0) - { - win_start = target; - } - else - { - win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; - } - - if (i_x * stride - padding + dim_kernel >= dim_im_in) - { - win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; - } - else - { - win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; - } - - /* first step is to copy over initial data */ - /* arm_copy_q7(win_start, target, ch_im_in); */ - memmove(target, win_start, ch_im_in); - - /* start the max operation from the second part */ - win_start += ch_im_in; - for (; win_start < win_stop; win_start += ch_im_in) - { - compare_and_replace_if_larger_q7(target, win_start, ch_im_in); - } - } - } - - /* then does the pooling along y axis */ - for (i_y = 0; i_y < dim_im_out; i_y++) - { - - /* for each output row */ - q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; - q7_t *row_start; - q7_t *row_end; - /* setting the starting row */ - if (i_y * stride - padding < 0) - { - row_start = Im_in; - } - else - { - row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; - } - /* setting the stopping row */ - if (i_y * stride - padding + dim_kernel >= dim_im_in) - { - row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; - } - else - { - row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; - } - - /* copy over the first row */ - /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */ - memmove(target, row_start, dim_im_out * ch_im_in); - - /* move over to next row */ - row_start += ch_im_in * dim_im_in; - - for (; row_start < row_end; row_start += dim_im_in * ch_im_in) - { - compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in); - } - } - -#else - /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ - int16_t i_ch_in, i_x, i_y; - int16_t k_x, k_y; - - for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) - { - for (i_y = 0; i_y < dim_im_out; i_y++) - { - for (i_x = 0; i_x < dim_im_out; i_x++) - { - int max = -129; - for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) - { - for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) - { - if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) - { - if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) - { - max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; - } - } - } - } - Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; - } - } - } - -#endif /* ARM_MATH_DSP */ -} - -/** - * @brief Q7 average pooling function - * @param[in,out] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * - * @details - * - * <b>Buffer size:</b> - * - * bufferA size: 2*dim_im_out*ch_im_in - * - * The pooling function is implemented as split x-pooling then - * y-pooling. - * - * This pooling function is input-destructive. Input data is undefined - * after calling this function. - * - */ - -void arm_avepool_q7_HWC(q7_t *Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t *bufferA, - q7_t *Im_out) -{ - -#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI) - /* Run the following code for Cortex-M4 and Cortex-M7 */ - - q15_t *buffer = (q15_t *)bufferA; - int16_t i_x, i_y; - int16_t count = 0; - - /* first does the pooling along x axis */ - for (i_y = 0; i_y < dim_im_in; i_y++) - { - - for (i_x = 0; i_x < dim_im_out; i_x++) - { - /* for each output pixel */ - q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; - q7_t *win_start; - q7_t *win_stop; - if (i_x * stride - padding < 0) - { - win_start = target; - } - else - { - win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in; - } - - if (i_x * stride - padding + dim_kernel >= dim_im_in) - { - win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; - } - else - { - win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in; - } - - /* first step is to copy over initial data */ - arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in); - count = 1; - - /* start the max operation from the second part */ - win_start += ch_im_in; - for (; win_start < win_stop; win_start += ch_im_in) - { - accumulate_q7_to_q15(buffer, win_start, ch_im_in); - count++; - } - buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); - } - } - - /* then does the pooling along y axis */ - for (i_y = 0; i_y < dim_im_out; i_y++) - { - /* for each output row */ - q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; - q7_t *row_start; - q7_t *row_end; - /* setting the starting row */ - if (i_y * stride - padding < 0) - { - row_start = Im_in; - } - else - { - row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; - } - /* setting the stopping row */ - if (i_y * stride - padding + dim_kernel >= dim_im_in) - { - row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; - } - else - { - row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in; - } - - /* copy over the first row */ - arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); - count = 1; - - /* move over to next row */ - row_start += ch_im_in * dim_im_in; - - for (; row_start < row_end; row_start += dim_im_in * ch_im_in) - { - accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); - count++; - } - buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count); - } - -#else - /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */ - - (void)bufferA; - int16_t i_ch_in, i_x, i_y; - int16_t k_x, k_y; - - for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) - { - for (i_y = 0; i_y < dim_im_out; i_y++) - { - for (i_x = 0; i_x < dim_im_out; i_x++) - { - int sum = 0; - int count = 0; - for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++) - { - for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++) - { - if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) - { - sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; - count++; - } - } - } - Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; - } - } - } - -#endif /* ARM_MATH_DSP */ -} - -/** - * @} end of Pooling group - */ +/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_pool_q7_HWC.c
+ * Description: Pooling function implementations
+ *
+ * $Date: 17. January 2018
+ * $Revision: V.1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+#include "arm_nnfunctions.h"
+
+#if defined (ARM_MATH_DSP)
+
+/**
+ * @brief A few utility functions used by pooling functions
+ *
+ *
+ */
+
+static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
+{
+ int i;
+
+ for (i = 0; i < length; i++)
+ {
+ target[i] = (q7_t) (buffer[i] / scale);
+ }
+}
+
+static void compare_and_replace_if_larger_q7(q7_t * base, // base data
+ const q7_t * target, // compare target
+ const uint16_t length // data length
+ )
+{
+ q7_t *pIn = base;
+ const q7_t *pCom = target;
+ union arm_nnword in;
+ union arm_nnword com;
+ uint16_t cnt = length >> 2;
+
+ while (cnt > 0u)
+ {
+ in.word = *__SIMD32(pIn);
+ com.word = *__SIMD32(pCom)++;
+
+ // if version
+ if (com.bytes[0] > in.bytes[0])
+ in.bytes[0] = com.bytes[0];
+ if (com.bytes[1] > in.bytes[1])
+ in.bytes[1] = com.bytes[1];
+ if (com.bytes[2] > in.bytes[2])
+ in.bytes[2] = com.bytes[2];
+ if (com.bytes[3] > in.bytes[3])
+ in.bytes[3] = com.bytes[3];
+
+ *__SIMD32(pIn)++ = in.word;
+
+ cnt--;
+ }
+
+ cnt = length & 0x3;
+ while (cnt > 0u)
+ {
+ if (*pCom > *pIn)
+ {
+ *pIn = *pCom;
+ }
+ pIn++;
+ pCom++;
+ cnt--;
+ }
+}
+
+static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
+{
+ q15_t *pCnt = base;
+ q7_t *pV = target;
+ q31_t v1, v2, vo1, vo2;
+ uint16_t cnt = length >> 2;
+ q31_t in;
+
+ while (cnt > 0u)
+ {
+ q31_t value = *__SIMD32(pV)++;
+ v1 = __SXTB16(__ROR(value, 8));
+ v2 = __SXTB16(value);
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ vo2 = __PKHTB(v1, v2, 16);
+ vo1 = __PKHBT(v2, v1, 16);
+
+#else
+
+ vo1 = __PKHTB(v1, v2, 16);
+ vo2 = __PKHBT(v2, v1, 16);
+
+#endif
+
+ in = *__SIMD32(pCnt);
+ *__SIMD32(pCnt)++ = __QADD16(vo1, in);
+
+ in = *__SIMD32(pCnt);
+ *__SIMD32(pCnt)++ = __QADD16(vo2, in);
+
+ cnt--;
+ }
+ cnt = length & 0x3;
+ while (cnt > 0u)
+ {
+ *pCnt++ += *pV++;
+ cnt--;
+ }
+}
+
+#endif // ARM_MATH_DSP
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Pooling
+ * @{
+ */
+
+ /**
+ * @brief Q7 max pooling function
+ * @param[in, out] Im_in pointer to input tensor
+ * @param[in] dim_im_in input tensor dimention
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] dim_kernel filter kernel size
+ * @param[in] padding padding sizes
+ * @param[in] stride convolution stride
+ * @param[in] dim_im_out output tensor dimension
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] Im_out pointer to output tensor
+ * @return none.
+ *
+ * @details
+ *
+ * <b>Buffer size:</b>
+ *
+ * bufferA size: 0
+ *
+ * The pooling function is implemented as split x-pooling then
+ * y-pooling.
+ *
+ * This pooling function is input-destructive. Input data is undefined
+ * after calling this function.
+ *
+ */
+
+void
+arm_maxpool_q7_HWC(q7_t * Im_in,
+ const uint16_t dim_im_in,
+ const uint16_t ch_im_in,
+ const uint16_t dim_kernel,
+ const uint16_t padding,
+ const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+ /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+ int16_t i_x, i_y;
+
+ /* first does the pooling along x axis */
+ for (i_y = 0; i_y < dim_im_in; i_y++)
+ {
+
+ for (i_x = 0; i_x < dim_im_out; i_x++)
+ {
+ /* for each output pixel */
+ q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
+ q7_t *win_start;
+ q7_t *win_stop;
+ if (i_x * stride - padding < 0)
+ {
+ win_start = target;
+ } else
+ {
+ win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
+ }
+
+ if (i_x * stride - padding + dim_kernel >= dim_im_in)
+ {
+ win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
+ } else
+ {
+ win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
+ }
+
+ /* first step is to copy over initial data */
+ /* arm_copy_q7(win_start, target, ch_im_in); */
+ memmove(target, win_start, ch_im_in);
+
+ /* start the max operation from the second part */
+ win_start += ch_im_in;
+ for (; win_start < win_stop; win_start += ch_im_in)
+ {
+ compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
+ }
+ }
+ }
+
+ /* then does the pooling along y axis */
+ for (i_y = 0; i_y < dim_im_out; i_y++)
+ {
+
+ /* for each output row */
+ q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
+ q7_t *row_start;
+ q7_t *row_end;
+ /* setting the starting row */
+ if (i_y * stride - padding < 0)
+ {
+ row_start = Im_in;
+ } else
+ {
+ row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
+ }
+ /* setting the stopping row */
+ if (i_y * stride - padding + dim_kernel >= dim_im_in)
+ {
+ row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
+ } else
+ {
+ row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
+ }
+
+ /* copy over the first row */
+ /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
+ memmove(target, row_start, dim_im_out * ch_im_in);
+
+ /* move over to next row */
+ row_start += ch_im_in * dim_im_in;
+
+ for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
+ {
+ compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
+ }
+ }
+
+#else
+ /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+ int16_t i_ch_in, i_x, i_y;
+ int16_t k_x, k_y;
+
+ for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+ {
+ for (i_y = 0; i_y < dim_im_out; i_y++)
+ {
+ for (i_x = 0; i_x < dim_im_out; i_x++)
+ {
+ int max = -129;
+ for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
+ {
+ for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
+ {
+ if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
+ {
+ if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
+ {
+ max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
+ }
+ }
+ }
+ }
+ Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
+ }
+ }
+ }
+
+#endif /* ARM_MATH_DSP */
+
+}
+
+ /**
+ * @brief Q7 average pooling function
+ * @param[in,out] Im_in pointer to input tensor
+ * @param[in] dim_im_in input tensor dimention
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] dim_kernel filter kernel size
+ * @param[in] padding padding sizes
+ * @param[in] stride convolution stride
+ * @param[in] dim_im_out output tensor dimension
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] Im_out pointer to output tensor
+ * @return none.
+ *
+ * @details
+ *
+ * <b>Buffer size:</b>
+ *
+ * bufferA size: 2*dim_im_out*ch_im_in
+ *
+ * The pooling function is implemented as split x-pooling then
+ * y-pooling.
+ *
+ * This pooling function is input-destructive. Input data is undefined
+ * after calling this function.
+ *
+ */
+
+void
+arm_avepool_q7_HWC(q7_t * Im_in,
+ const uint16_t dim_im_in,
+ const uint16_t ch_im_in,
+ const uint16_t dim_kernel,
+ const uint16_t padding,
+ const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+ /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+ q15_t *buffer = (q15_t *) bufferA;
+ int16_t i_x, i_y;
+ int16_t count = 0;
+
+ /* first does the pooling along x axis */
+ for (i_y = 0; i_y < dim_im_in; i_y++)
+ {
+
+ for (i_x = 0; i_x < dim_im_out; i_x++)
+ {
+ /* for each output pixel */
+ q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
+ q7_t *win_start;
+ q7_t *win_stop;
+ if (i_x * stride - padding < 0)
+ {
+ win_start = target;
+ } else
+ {
+ win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
+ }
+
+ if (i_x * stride - padding + dim_kernel >= dim_im_in)
+ {
+ win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
+ } else
+ {
+ win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
+ }
+
+ /* first step is to copy over initial data */
+ arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
+ count = 1;
+
+ /* start the max operation from the second part */
+ win_start += ch_im_in;
+ for (; win_start < win_stop; win_start += ch_im_in)
+ {
+ accumulate_q7_to_q15(buffer, win_start, ch_im_in);
+ count++;
+ }
+ buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+ }
+ }
+
+ /* then does the pooling along y axis */
+ for (i_y = 0; i_y < dim_im_out; i_y++)
+ {
+ /* for each output row */
+ q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
+ q7_t *row_start;
+ q7_t *row_end;
+ /* setting the starting row */
+ if (i_y * stride - padding < 0)
+ {
+ row_start = Im_in;
+ } else
+ {
+ row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
+ }
+ /* setting the stopping row */
+ if (i_y * stride - padding + dim_kernel >= dim_im_in)
+ {
+ row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
+ } else
+ {
+ row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
+ }
+
+ /* copy over the first row */
+ arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
+ count = 1;
+
+ /* move over to next row */
+ row_start += ch_im_in * dim_im_in;
+
+ for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
+ {
+ accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
+ count++;
+ }
+ buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
+ }
+
+#else
+ /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+ int16_t i_ch_in, i_x, i_y;
+ int16_t k_x, k_y;
+
+ for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+ {
+ for (i_y = 0; i_y < dim_im_out; i_y++)
+ {
+ for (i_x = 0; i_x < dim_im_out; i_x++)
+ {
+ int sum = 0;
+ int count = 0;
+ for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
+ {
+ for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
+ {
+ if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
+ {
+ sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
+ count++;
+ }
+ }
+ }
+ Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
+ }
+ }
+ }
+
+#endif /* ARM_MATH_DSP */
+
+}
+
+/**
+ * @} end of Pooling group
+ */
|