6 files changed, 1426 insertions, 0 deletions
diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt b/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt
new file mode 100644
index 0000000..a37503b
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/CMakeLists.txt
@@ -0,0 +1,24 @@
+#
+# Copyright (c) 2019-2022 Arm Limited.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+file(GLOB SRC "./*_s8.c") # collect every file in this directory whose name ends in _s8.c
+file(GLOB SRC_S16 "./*_s16.c") # collect every file in this directory whose name ends in _s16.c
+target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16}) # files matching neither pattern (e.g. arm_pool_q7_HWC.c) are not added here
+
+
+
diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c
new file mode 100644
index 0000000..5cd2b1c
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s16.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2022 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_avgpool_s16.c
+ * Description:  Pooling function implementations
+ *
+ * $Date:        3. February 2022
+ * $Revision:    V.1.0.1
+ *
+ * Target Processor:  Cortex-M CPUs
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Pooling
+ * @{
+ */
+
+/*
+ * s16 average pooling function: every output element is the rounded mean of
+ * the in-bounds input samples under its kernel window, clamped to the
+ * activation range. Fails with ARM_MATH_ARGUMENT_ERROR on an empty window.
+ * Refer to header file for details.
+ */
+arm_status arm_avgpool_s16(const cmsis_nn_context *ctx,
+                           const cmsis_nn_pool_params *pool_params,
+                           const cmsis_nn_dims *input_dims,
+                           const q15_t *src,
+                           const cmsis_nn_dims *filter_dims,
+                           const cmsis_nn_dims *output_dims,
+                           q15_t *dst)
+{
+    (void)ctx; /* Unused: this implementation never touches a scratch buffer. */
+    const int32_t input_y = input_dims->h;
+    const int32_t input_x = input_dims->w;
+    const int32_t output_y = output_dims->h;
+    const int32_t output_x = output_dims->w;
+    const int32_t stride_y = pool_params->stride.h;
+    const int32_t stride_x = pool_params->stride.w;
+    const int32_t kernel_y = filter_dims->h;
+    const int32_t kernel_x = filter_dims->w;
+    const int32_t pad_y = pool_params->padding.h;
+    const int32_t pad_x = pool_params->padding.w;
+    const int32_t act_min = pool_params->activation.min;
+    const int32_t act_max = pool_params->activation.max;
+    const int32_t ch_src = input_dims->c; /* Also used as the channel count when indexing dst. */
+
+    /* Reference C code adapted from CMSIS-NN arm_avgpool_s8.c.
+     * base_idx_<x,y> is the input coordinate of the window origin; it is negative inside the padding. */
+
+    for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++)
+    {
+        for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++)
+        {
+            /* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
+            const int32_t ker_y_start = MAX(0, -base_idx_y);
+            const int32_t ker_x_start = MAX(0, -base_idx_x);
+
+            /* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
+            const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y);
+            const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x);
+
+            for (int i_ch_in = 0; i_ch_in < ch_src; i_ch_in++)
+            {
+                int sum = 0;
+                int count = 0; /* Number of in-bounds samples; padded positions are excluded from the mean. */
+
+                for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++)
+                {
+                    for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++)
+                    {
+                        sum += src[i_ch_in + ch_src * (k_x + base_idx_x + (k_y + base_idx_y) * input_x)]; /* Channel is the fastest-varying index, then x, then y. */
+                        count++;
+                    }
+                }
+
+                // Prevent static code issue DIVIDE_BY_ZERO. An empty window aborts the whole call.
+                if (count == 0)
+                {
+                    return ARM_MATH_ARGUMENT_ERROR;
+                }
+
+                sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; /* Mean, rounded to nearest with ties away from zero. */
+                sum = MAX(sum, act_min);
+                sum = MIN(sum, act_max);
+
+                dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum; /* Same channel-fastest layout as the input. */
+            }
+        }
+    }
+
+    return ARM_MATH_SUCCESS;
+}
+
+int32_t arm_avgpool_s16_get_buffer_size(const int output_x, const int ch_src)
+{ /* Buffer-size query for arm_avgpool_s16: always returns 0, whatever the arguments. */
+    (void)output_x; /* Ignored. */
+    (void)ch_src; /* Ignored. */
+    return 0; /* arm_avgpool_s16 discards its ctx argument, so there is nothing to allocate. */
+}
+
+/**
+ * @} end of Pooling group
+ */
diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
new file mode 100644
index 0000000..3e9861e
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_avgpool_s8.c
+ * Description:  Pooling function implementations
+ *
+ * $Date:        01. March 2021
+ * $Revision:    V.2.0.4
+ *
+ * Target Processor:  Cortex-M CPUs
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+
+static void scale_q31_to_q7_and_clamp(const q31_t *buffer, /* in: 'length' accumulated sums */
+                                      q7_t *target, /* out: 'length' averaged and clamped values */
+                                      int32_t length, /* number of elements to convert */
+                                      const int32_t count, /* divisor: samples that went into each sum */
+                                      const int act_min, /* lower clamp bound */
+                                      const int act_max) /* upper clamp bound */
+{
+    const int half_count = count / 2; /* Rounding offset added (or subtracted) before the division. */
+
+    // Prevent static code issue DIVIDE_BY_ZERO. With a zero count nothing is written to target.
+    if (count == 0)
+    {
+        return;
+    }
+
+    for (int i = 0; i < length; i++)
+    {
+        int32_t sum = buffer[i] > 0 ? (buffer[i] + half_count) : (buffer[i] - half_count); /* Bias away from zero so the truncating division rounds to nearest. */
+        sum = sum / count;
+        sum = MAX(sum, act_min);
+        sum = MIN(sum, act_max);
+
+        target[i] = (q7_t)sum; /* Narrowing cast; the clamp above is what keeps the value in range. */
+    }
+}
+#endif
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Pooling
+ * @{
+ */
+
+/*
+ * s8 average pooling function
+ *
+ * Refer to header file for details.
+ *
+ */
+
+#if defined(ARM_MATH_MVEI)
+
+arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
+                          const cmsis_nn_pool_params *pool_params,
+                          const cmsis_nn_dims *input_dims,
+                          const q7_t *src,
+                          const cmsis_nn_dims *filter_dims,
+                          const cmsis_nn_dims *output_dims,
+                          q7_t *dst)
+{ /* ARM_MATH_MVEI build of s8 average pooling: 16 channels per vector pass, scalar loop for the remainder. */
+    (void)ctx; /* Unused in this variant. */
+    const int32_t input_y = input_dims->h;
+    const int32_t input_x = input_dims->w;
+    const int32_t output_y = output_dims->h;
+    const int32_t output_x = output_dims->w;
+    const int32_t stride_y = pool_params->stride.h;
+    const int32_t stride_x = pool_params->stride.w;
+    const int32_t kernel_y = filter_dims->h;
+    const int32_t kernel_x = filter_dims->w;
+    const int32_t pad_y = pool_params->padding.h;
+    const int32_t pad_x = pool_params->padding.w;
+    const int32_t act_min = pool_params->activation.min;
+    const int32_t act_max = pool_params->activation.max;
+    const int32_t ch_src = input_dims->c;
+
+    int32_t i_x, i_y;
+    int32_t k_x, k_y;
+
+    for (i_y = 0; i_y < output_y; i_y++)
+    {
+        for (i_x = 0; i_x < output_x; i_x++)
+        {
+
+            int32_t k_y_start, k_y_end;
+            int32_t k_x_start, k_x_end;
+            int32_t chCnt;
+            const int8_t *pTmp, *pTmpInner;
+            int8_t *pDst;
+
+            k_y_start = MAX(0, i_y * stride_y - pad_y); /* Window bounds here are absolute input coordinates, clipped to the input. */
+            k_y_end = MIN(i_y * stride_y - pad_y + kernel_y, input_y);
+
+            k_x_start = MAX(0, i_x * stride_x - pad_x);
+            k_x_end = MIN(i_x * stride_x - pad_x + kernel_x, input_x);
+
+            pTmp = src; /* Channel cursor: advanced by the channels already processed for this output position. */
+            pDst = &dst[ch_src * (i_x + i_y * output_x)];
+
+            chCnt = ch_src >> 4; /* Number of complete 16-channel groups. */
+            while (chCnt > 0)
+            {
+                int32x4_t sumV1, sumV2, sumV3, sumV4;
+
+                int8x16_t tempV;
+                int16x8_t tempVLO, tempVHI;
+                int32x4_t tempVLOLO, tempVLOHI, tempVHILO, tempVHIHI;
+                int32_t count = 0;
+
+                sumV1 = vdupq_n_s32(0);
+                sumV2 = vdupq_n_s32(0);
+                sumV3 = vdupq_n_s32(0);
+                sumV4 = vdupq_n_s32(0);
+
+                for (k_y = k_y_start; k_y < k_y_end; k_y++)
+                {
+                    for (k_x = k_x_start; k_x < k_x_end; k_x++)
+                    {
+                        pTmpInner = pTmp + (ch_src * (k_x + k_y * input_x));
+                        tempV = vldrbq_s8(pTmpInner); /* 16 consecutive channels of one input pixel. */
+
+                        tempVLO = vmovlbq_s8(tempV); /* Widen 8 -> 16 bit in two halves ... */
+                        tempVHI = vmovltq_s8(tempV);
+
+                        tempVLOLO = vmovlbq_s16(tempVLO); /* ... then 16 -> 32 bit, giving four int32x4 vectors. */
+                        tempVLOHI = vmovltq_s16(tempVLO);
+
+                        tempVHILO = vmovlbq_s16(tempVHI);
+                        tempVHIHI = vmovltq_s16(tempVHI);
+
+                        sumV1 = vaddq_s32(sumV1, tempVLOLO);
+                        sumV2 = vaddq_s32(sumV2, tempVLOHI);
+                        sumV3 = vaddq_s32(sumV3, tempVHILO);
+                        sumV4 = vaddq_s32(sumV4, tempVHIHI);
+
+                        count++;
+                    }
+                }
+
+                // Prevent static code issue DIVIDE_BY_ZERO. An empty window aborts the whole call.
+                if (count == 0)
+                {
+                    return ARM_MATH_ARGUMENT_ERROR;
+                }
+
+                sumV1[0] = sumV1[0] > 0 ? (sumV1[0] + count / 2) / count : (sumV1[0] - count / 2) / count; /* Per-lane scalar division, rounding to nearest with ties away from zero. */
+                sumV1[1] = sumV1[1] > 0 ? (sumV1[1] + count / 2) / count : (sumV1[1] - count / 2) / count;
+                sumV1[2] = sumV1[2] > 0 ? (sumV1[2] + count / 2) / count : (sumV1[2] - count / 2) / count;
+                sumV1[3] = sumV1[3] > 0 ? (sumV1[3] + count / 2) / count : (sumV1[3] - count / 2) / count;
+
+                sumV2[0] = sumV2[0] > 0 ? (sumV2[0] + count / 2) / count : (sumV2[0] - count / 2) / count;
+                sumV2[1] = sumV2[1] > 0 ? (sumV2[1] + count / 2) / count : (sumV2[1] - count / 2) / count;
+                sumV2[2] = sumV2[2] > 0 ? (sumV2[2] + count / 2) / count : (sumV2[2] - count / 2) / count;
+                sumV2[3] = sumV2[3] > 0 ? (sumV2[3] + count / 2) / count : (sumV2[3] - count / 2) / count;
+
+                sumV3[0] = sumV3[0] > 0 ? (sumV3[0] + count / 2) / count : (sumV3[0] - count / 2) / count;
+                sumV3[1] = sumV3[1] > 0 ? (sumV3[1] + count / 2) / count : (sumV3[1] - count / 2) / count;
+                sumV3[2] = sumV3[2] > 0 ? (sumV3[2] + count / 2) / count : (sumV3[2] - count / 2) / count;
+                sumV3[3] = sumV3[3] > 0 ? (sumV3[3] + count / 2) / count : (sumV3[3] - count / 2) / count;
+
+                sumV4[0] = sumV4[0] > 0 ? (sumV4[0] + count / 2) / count : (sumV4[0] - count / 2) / count;
+                sumV4[1] = sumV4[1] > 0 ? (sumV4[1] + count / 2) / count : (sumV4[1] - count / 2) / count;
+                sumV4[2] = sumV4[2] > 0 ? (sumV4[2] + count / 2) / count : (sumV4[2] - count / 2) / count;
+                sumV4[3] = sumV4[3] > 0 ? (sumV4[3] + count / 2) / count : (sumV4[3] - count / 2) / count;
+
+                sumV1 = vmaxq_s32(sumV1, vdupq_n_s32(act_min)); /* Clamp every lane to [act_min, act_max]. */
+                sumV1 = vminq_s32(sumV1, vdupq_n_s32(act_max));
+
+                sumV2 = vmaxq_s32(sumV2, vdupq_n_s32(act_min));
+                sumV2 = vminq_s32(sumV2, vdupq_n_s32(act_max));
+
+                sumV3 = vmaxq_s32(sumV3, vdupq_n_s32(act_min));
+                sumV3 = vminq_s32(sumV3, vdupq_n_s32(act_max));
+
+                sumV4 = vmaxq_s32(sumV4, vdupq_n_s32(act_min));
+                sumV4 = vminq_s32(sumV4, vdupq_n_s32(act_max));
+
+                tempVLO = vmovnbq_s32(tempVLO, sumV1); /* Narrow 32 -> 16 -> 8 bit, mirroring the widening sequence above. */
+                tempVLO = vmovntq_s32(tempVLO, sumV2);
+
+                tempVHI = vmovnbq_s32(tempVHI, sumV3);
+                tempVHI = vmovntq_s32(tempVHI, sumV4);
+
+                tempV = vmovnbq_s16(tempV, tempVLO);
+                tempV = vmovntq_s16(tempV, tempVHI);
+
+                vstrbq_s8(pDst, tempV);
+                pDst += 16;
+
+                chCnt--;
+                pTmp += 16; /* Move on to the next 16-channel group. */
+            }
+
+            chCnt = ch_src & 0xF; /* Leftover channels (fewer than 16), handled one at a time. */
+            while (chCnt > 0)
+            {
+                int32_t sum = 0;
+                int32_t count = 0;
+
+                for (k_y = k_y_start; k_y < k_y_end; k_y++)
+                {
+                    for (k_x = k_x_start; k_x < k_x_end; k_x++)
+                    {
+                        sum += pTmp[ch_src * (k_x + k_y * input_x)];
+                        count++;
+                    }
+                }
+
+                // Prevent static code issue DIVIDE_BY_ZERO. An empty window aborts the whole call.
+                if (count == 0)
+                {
+                    return ARM_MATH_ARGUMENT_ERROR;
+                }
+
+                sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; /* Same rounding rule as the vector path. */
+                sum = MAX(sum, act_min);
+                sum = MIN(sum, act_max);
+
+                *pDst++ = sum;
+
+                chCnt--;
+                pTmp++;
+            }
+        }
+    }
+    return ARM_MATH_SUCCESS;
+}
+
+#else
+arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
+                          const cmsis_nn_pool_params *pool_params,
+                          const cmsis_nn_dims *input_dims,
+                          const q7_t *src,
+                          const cmsis_nn_dims *filter_dims,
+                          const cmsis_nn_dims *output_dims,
+                          q7_t *dst)
+{ /* Non-MVE build of s8 average pooling: rounded mean of the in-bounds samples per window, clamped to the activation range. */
+    const int32_t input_y = input_dims->h;
+    const int32_t input_x = input_dims->w;
+    const int32_t output_y = output_dims->h;
+    const int32_t output_x = output_dims->w;
+    const int32_t stride_y = pool_params->stride.h;
+    const int32_t stride_x = pool_params->stride.w;
+    const int32_t kernel_y = filter_dims->h;
+    const int32_t kernel_x = filter_dims->w;
+    const int32_t pad_y = pool_params->padding.h;
+    const int32_t pad_x = pool_params->padding.w;
+    const int32_t act_min = pool_params->activation.min;
+    const int32_t act_max = pool_params->activation.max;
+    const int32_t ch_src = input_dims->c;
+
+    if (ctx->buf == NULL && arm_avgpool_s8_get_buffer_size(output_dims->w, input_dims->c)) /* A missing buffer is an error only when the size query is non-zero. */
+    {
+        return ARM_MATH_ARGUMENT_ERROR;
+    }
+    q31_t *buffer = (q31_t *)ctx->buf; /* Per-channel 32-bit accumulators for the DSP path. */
+
+#if defined(ARM_MATH_DSP)
+
+    /* Run the following code for CPU's with DSP extension
+     */
+    for (int i_y = 0, idx_y = -pad_y; i_y < output_y; idx_y += stride_y, i_y++)
+    {
+        for (int i_x = 0, idx_x = -pad_x; i_x < output_x; idx_x += stride_x, i_x++)
+        {
+            /* Condition for kernel start dimension:
+                      (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
+            const int32_t kernel_y_start = MAX(0, -idx_y);
+            const int32_t kernel_x_start = MAX(0, -idx_x);
+
+            /* Condition for kernel end dimension:
+                   (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
+            const int32_t kernel_y_end = MIN(kernel_y, input_y - idx_y);
+            const int32_t kernel_x_end = MIN(kernel_x, input_x - idx_x);
+
+            int count = 0; /* In-bounds samples seen so far for this output position. */
+
+            for (int k_y = kernel_y_start; k_y < kernel_y_end; k_y++)
+            {
+                for (int k_x = kernel_x_start; k_x < kernel_x_end; k_x++)
+                {
+                    const q7_t *start = src + ch_src * (k_x + idx_x + (k_y + idx_y) * input_x);
+
+                    if (count == 0) /* First sample initialises the accumulators, so no separate clearing pass is needed. */
+                    {
+                        for (int i = 0; i < ch_src; i++)
+                        {
+                            buffer[i] = start[i];
+                        }
+                    }
+                    else
+                    {
+                        for (int i = 0; i < ch_src; i++)
+                        {
+                            buffer[i] = __QADD(start[i], buffer[i]);
+                        }
+                    }
+                    count++;
+                }
+            }
+
+            // Prevent static code issue DIVIDE_BY_ZERO. An empty window aborts the whole call.
+            if (count == 0)
+            {
+                return ARM_MATH_ARGUMENT_ERROR;
+            }
+
+            scale_q31_to_q7_and_clamp(buffer, dst, ch_src, count, act_min, act_max);
+            dst += ch_src; /* Output positions are produced in order, ch_src values each. */
+        }
+    }
+#else
+
+    /* Reference C code adapted from CMSIS-NN arm_avepool_q7_HWC.
+     * Indices are 32-bit so that dimensions or coordinates above INT16_MAX cannot wrap the loop counters. */
+    (void)buffer;
+    int32_t i_ch_in, i_x, i_y;
+    int32_t k_x, k_y;
+
+    for (i_y = 0; i_y < output_y; i_y++)
+    {
+        for (i_x = 0; i_x < output_x; i_x++)
+        {
+            for (i_ch_in = 0; i_ch_in < ch_src; i_ch_in++)
+            {
+                int sum = 0;
+                int count = 0;
+                for (k_y = i_y * stride_y - pad_y; k_y < i_y * stride_y - pad_y + kernel_y; k_y++)
+                {
+                    for (k_x = i_x * stride_x - pad_x; k_x < i_x * stride_x - pad_x + kernel_x; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < input_y && k_x < input_x) /* Skip padded positions; they do not count towards the mean. */
+                        {
+                            sum += src[i_ch_in + ch_src * (k_x + k_y * input_x)];
+                            count++;
+                        }
+                    }
+                }
+
+                // Prevent static code issue DIVIDE_BY_ZERO. An empty window aborts the whole call.
+                if (count == 0)
+                {
+                    return ARM_MATH_ARGUMENT_ERROR;
+                }
+
+                sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count; /* Mean, rounded to nearest with ties away from zero. */
+                sum = MAX(sum, act_min);
+                sum = MIN(sum, act_max);
+
+                dst[i_ch_in + ch_src * (i_x + i_y * output_x)] = sum;
+            }
+        }
+    }
+
+#endif
+    return ARM_MATH_SUCCESS;
+}
+
+#endif /* ARM_MATH_MVEI */
+
+int32_t arm_avgpool_s8_get_buffer_size(const int output_x, const int ch_src)
+{ /* Size in bytes of the buffer arm_avgpool_s8 expects in ctx->buf for the current build configuration. */
+    (void)output_x; /* Ignored in every configuration. */
+
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    return (ch_src * sizeof(int32_t)); /* One 32-bit accumulator per channel. */
+#else
+    (void)ch_src;
+    return 0; /* Neither the MVEI nor the plain C variant uses the buffer. */
+#endif
+}
+/**
+ * @} end of Pooling group
+ */
diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
new file mode 100644
index 0000000..483f874
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2022 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_max_pool_s16.c
+ * Description:  Pooling function implementations
+ *
+ * $Date:        24. January 2022
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M CPUs
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+static void compare_and_replace_if_larger(int16_t *base, const int16_t *target, int32_t length)
+{ /* In-place element-wise maximum: base[i] = max(base[i], target[i]) for the first 'length' elements. */
+    q15_t *dst = base;
+    const q15_t *src = target;
+    union arm_nnword ref_max;
+    union arm_nnword comp_max;
+    int32_t cnt = length >> 1; /* Number of element pairs; each pass below handles two values. */
+
+    while (cnt > 0l)
+    {
+        ref_max.word = arm_nn_read_q15x2(dst);
+        comp_max.word = arm_nn_read_q15x2_ia(&src); /* presumably advances src by two elements - the tail code below relies on it; confirm in arm_nnsupportfunctions.h */
+
+        if (comp_max.half_words[0] > ref_max.half_words[0])
+        {
+            ref_max.half_words[0] = comp_max.half_words[0];
+        }
+        if (comp_max.half_words[1] > ref_max.half_words[1])
+        {
+            ref_max.half_words[1] = comp_max.half_words[1];
+        }
+
+        arm_nn_write_q15x2_ia(&dst, ref_max.word); /* Store the pair of maxima back; dst is passed by address so the helper can advance it. */
+
+        cnt--;
+    }
+
+    if (length & 0x1) /* Odd length: one element is left after the pairs. */
+    {
+        if (*src > *dst)
+        {
+            *dst = *src;
+        }
+    }
+}
+
+static void clamp_output(int16_t *source, int32_t length, const int16_t act_min, const int16_t act_max)
+{ /* Clamps the first 'length' elements of source to [act_min, act_max], in place. */
+    union arm_nnword in;
+    int32_t cnt = length >> 1; /* Number of element pairs; each pass below handles two values. */
+
+    while (cnt > 0l)
+    {
+        in.word = arm_nn_read_q15x2(source);
+
+        in.half_words[0] = MAX(in.half_words[0], act_min);
+        in.half_words[0] = MIN(in.half_words[0], act_max);
+        in.half_words[1] = MAX(in.half_words[1], act_min);
+        in.half_words[1] = MIN(in.half_words[1], act_max);
+
+        arm_nn_write_q15x2_ia(&source, in.word); /* Write the pair back; source is passed by address so the helper can advance it. */
+        cnt--;
+    }
+
+    if (length & 0x1) /* Odd length: clamp the single remaining element. */
+    {
+        int16_t comp = *source;
+        comp = MAX(comp, act_min);
+        comp = MIN(comp, act_max);
+        *source = comp;
+    }
+}
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Pooling
+ * @{
+ */
+
+/*
+ * Optimized s16 max pooling function: every output element is the maximum of
+ * the in-bounds input samples under its kernel window; the complete output is
+ * clamped to the activation range in one final pass.
+ * Refer to header file for details.
+ */
+
+arm_status arm_max_pool_s16(const cmsis_nn_context *ctx,
+                            const cmsis_nn_pool_params *pool_params,
+                            const cmsis_nn_dims *input_dims,
+                            const int16_t *src,
+                            const cmsis_nn_dims *filter_dims,
+                            const cmsis_nn_dims *output_dims,
+                            int16_t *dst)
+{
+    const int32_t input_y = input_dims->h;
+    const int32_t input_x = input_dims->w;
+    const int32_t output_y = output_dims->h;
+    const int32_t output_x = output_dims->w;
+    const int32_t stride_y = pool_params->stride.h;
+    const int32_t stride_x = pool_params->stride.w;
+    const int32_t kernel_y = filter_dims->h;
+    const int32_t kernel_x = filter_dims->w;
+    const int32_t pad_y = pool_params->padding.h;
+    const int32_t pad_x = pool_params->padding.w;
+    const int16_t act_min = pool_params->activation.min; /* Activation bounds are narrowed to 16 bit here. */
+    const int16_t act_max = pool_params->activation.max;
+    const int32_t channel_in = input_dims->c;
+    (void)ctx; /* Unused. */
+    int16_t *dst_base = dst; /* Start of the output, kept for the final clamp pass. */
+
+    for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++)
+    {
+        for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++)
+        {
+            /* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
+            const int32_t ker_y_start = MAX(0, -base_idx_y);
+            const int32_t ker_x_start = MAX(0, -base_idx_x);
+
+            /* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
+            const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y);
+            const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x);
+
+            int count = 0; /* Acts as a "first sample seen" flag: it is incremented once and never again. */
+
+            for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++)
+            {
+                for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++)
+                {
+                    const int16_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x);
+
+                    if (count == 0)
+                    {
+                        memcpy(dst, start, channel_in * sizeof(int16_t)); /* First sample seeds the running maximum for all channels. */
+                        count++;
+                    }
+                    else
+                    {
+                        compare_and_replace_if_larger(dst, start, channel_in);
+                    }
+                }
+            }
+            /* 'count' is expected to be non-zero here. NOTE(review): if it is zero, this output position is left unwritten. */
+            dst += channel_in;
+        }
+    }
+
+    clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max);
+
+    return ARM_MATH_SUCCESS;
+}
+
+/**
+ * @} end of Pooling group
+ */
diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
new file mode 100644
index 0000000..4fbbc91
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_max_pool_s8.c
+ * Description:  Pooling function implementations
+ *
+ * $Date:        20. July 2021
+ * $Revision:    V.2.0.3
+ *
+ * Target Processor:  Cortex-M CPUs
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+static void compare_and_replace_if_larger_q7(q7_t *base, const q7_t *target, int32_t length)
+{ /* In-place element-wise maximum: base[i] = max(base[i], target[i]) for the first 'length' elements. */
+#if defined(ARM_MATH_MVEI)
+    int32_t loop_count = (length + 15) / 16; /* Number of 16-byte steps, rounded up to cover a partial tail. */
+    for (int i = 0; i < loop_count; i++)
+    {
+        mve_pred16_t p = vctp8q((uint32_t)length); /* Predicate built from the remaining length; it gates every load, max and store below. */
+        const int8x16_t op_1 = vldrbq_z_s8(base, p);
+        const int8x16_t op_2 = vldrbq_z_s8(target, p);
+        const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p);
+        vstrbq_p_s8(base, max, p);
+        base += 16;
+        target += 16;
+        length -= 16;
+    }
+#else
+    q7_t *dst = base;
+    const q7_t *src = target;
+    union arm_nnword ref_max;
+    union arm_nnword comp_max;
+    int32_t cnt = length >> 2; /* Number of 4-element groups; each pass below handles four values. */
+
+    while (cnt > 0l)
+    {
+        ref_max.word = arm_nn_read_q7x4(dst);
+        comp_max.word = arm_nn_read_q7x4_ia(&src); /* presumably advances src by four elements - the tail loop below relies on it; confirm in arm_nnsupportfunctions.h */
+
+        if (comp_max.bytes[0] > ref_max.bytes[0])
+        {
+            ref_max.bytes[0] = comp_max.bytes[0];
+        }
+        if (comp_max.bytes[1] > ref_max.bytes[1])
+        {
+            ref_max.bytes[1] = comp_max.bytes[1];
+        }
+        if (comp_max.bytes[2] > ref_max.bytes[2])
+        {
+            ref_max.bytes[2] = comp_max.bytes[2];
+        }
+        if (comp_max.bytes[3] > ref_max.bytes[3])
+        {
+            ref_max.bytes[3] = comp_max.bytes[3];
+        }
+
+        arm_nn_write_q7x4_ia(&dst, ref_max.word); /* Store the four maxima back; dst is passed by address so the helper can advance it. */
+
+        cnt--;
+    }
+
+    cnt = length & 0x3; /* Up to three leftover elements, handled one by one. */
+    while (cnt > 0l)
+    {
+        if (*src > *dst)
+        {
+            *dst = *src;
+        }
+        dst++;
+        src++;
+        cnt--;
+    }
+#endif
+}
+
+static void clamp_output(q7_t *source, int32_t length, const int32_t act_min, const int32_t act_max)
+{ /* Clamps the first 'length' elements of source to [act_min, act_max], in place. */
+#if defined(ARM_MATH_MVEI)
+    int32_t loop_count = (length + 15) / 16; /* Number of 16-byte steps, rounded up to cover a partial tail. */
+    for (int i = 0; i < loop_count; i++)
+    {
+        mve_pred16_t p = vctp8q((uint32_t)length); /* Predicate built from the remaining length; it gates every operation below. */
+        length -= 16;
+        const int8x16_t src = vldrbq_z_s8(source, p);
+        const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p); /* Bounds are narrowed to 8 bit for the vector compare. */
+        const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p);
+        int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p);
+        res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p);
+        vstrbq_p_s8(source, res, p);
+        source += 16;
+    }
+#else
+    union arm_nnword in;
+    int32_t cnt = length >> 2; /* Number of 4-element groups; each pass below handles four values. */
+
+    while (cnt > 0l)
+    {
+        in.word = arm_nn_read_q7x4(source);
+
+        in.bytes[0] = MAX(in.bytes[0], act_min);
+        in.bytes[0] = MIN(in.bytes[0], act_max);
+        in.bytes[1] = MAX(in.bytes[1], act_min);
+        in.bytes[1] = MIN(in.bytes[1], act_max);
+        in.bytes[2] = MAX(in.bytes[2], act_min);
+        in.bytes[2] = MIN(in.bytes[2], act_max);
+        in.bytes[3] = MAX(in.bytes[3], act_min);
+        in.bytes[3] = MIN(in.bytes[3], act_max);
+
+        arm_nn_write_q7x4_ia(&source, in.word); /* Write the group back; source is passed by address so the helper can advance it. */
+        cnt--;
+    }
+
+    cnt = length & 0x3; /* Up to three leftover elements, handled one by one. */
+    while (cnt > 0l)
+    {
+        int32_t comp = *source;
+        comp = MAX(comp, act_min);
+        comp = MIN(comp, act_max);
+        *source++ = (int8_t)comp;
+        cnt--;
+    }
+#endif
+}
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Pooling
+ * @{
+ */
+
+/*
+ * Optimized s8 max pooling function: every output element is the maximum of
+ * the in-bounds input samples under its kernel window; the complete output is
+ * clamped to the activation range in one final pass.
+ * Refer to header file for details.
+ */
+
+arm_status arm_max_pool_s8(const cmsis_nn_context *ctx,
+                           const cmsis_nn_pool_params *pool_params,
+                           const cmsis_nn_dims *input_dims,
+                           const q7_t *src,
+                           const cmsis_nn_dims *filter_dims,
+                           const cmsis_nn_dims *output_dims,
+                           q7_t *dst)
+{
+    const int32_t input_y = input_dims->h;
+    const int32_t input_x = input_dims->w;
+    const int32_t output_y = output_dims->h;
+    const int32_t output_x = output_dims->w;
+    const int32_t stride_y = pool_params->stride.h;
+    const int32_t stride_x = pool_params->stride.w;
+    const int32_t kernel_y = filter_dims->h;
+    const int32_t kernel_x = filter_dims->w;
+    const int32_t pad_y = pool_params->padding.h;
+    const int32_t pad_x = pool_params->padding.w;
+    const int32_t act_min = pool_params->activation.min;
+    const int32_t act_max = pool_params->activation.max;
+    const int32_t channel_in = input_dims->c;
+    (void)ctx; /* Unused. */
+    q7_t *dst_base = dst; /* Start of the output, kept for the final clamp pass. */
+
+    for (int i_y = 0, base_idx_y = -pad_y; i_y < output_y; base_idx_y += stride_y, i_y++)
+    {
+        for (int i_x = 0, base_idx_x = -pad_x; i_x < output_x; base_idx_x += stride_x, i_x++)
+        {
+            /* Condition for kernel start dimension: (base_idx_<x,y> + kernel_<x,y>_start) >= 0 */
+            const int32_t ker_y_start = MAX(0, -base_idx_y);
+            const int32_t ker_x_start = MAX(0, -base_idx_x);
+
+            /* Condition for kernel end dimension: (base_idx_<x,y> + kernel_<x,y>_end) < dim_src_<width,height> */
+            const int32_t kernel_y_end = MIN(kernel_y, input_y - base_idx_y);
+            const int32_t kernel_x_end = MIN(kernel_x, input_x - base_idx_x);
+
+            int count = 0; /* Acts as a "first sample seen" flag: it is incremented once and never again. */
+
+            for (int k_y = ker_y_start; k_y < kernel_y_end; k_y++)
+            {
+                for (int k_x = ker_x_start; k_x < kernel_x_end; k_x++)
+                {
+                    const q7_t *start = src + channel_in * (k_x + base_idx_x + (k_y + base_idx_y) * input_x);
+
+                    if (count == 0)
+                    {
+                        arm_memcpy_q7(dst, start, channel_in); /* First sample seeds the running maximum for all channels. */
+                        count++;
+                    }
+                    else
+                    {
+                        compare_and_replace_if_larger_q7(dst, start, channel_in);
+                    }
+                }
+            }
+            /* 'count' is expected to be non-zero here. NOTE(review): if it is zero, this output position is left unwritten. */
+            dst += channel_in;
+        }
+    }
+
+    clamp_output(dst_base, output_x * output_y * channel_in, act_min, act_max);
+
+    return ARM_MATH_SUCCESS;
+}
+
+/**
+ * @} end of Pooling group
+ */
diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
new file mode 100644
index 0000000..5a3b1af
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_pool_q7_HWC.c
+ * Description:  Pooling function implementations
+ *
+ * $Date:        20. July 2021
+ * $Revision:    V.1.1.1
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+
+/**
+ * @brief A few utility functions used by pooling functions
+ *
+ *
+ */
+
+static void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale)
+{ /* For each i in [0, length): target[i] = buffer[i] / scale, narrowed to q7_t. */
+    int i;
+
+    for (i = 0; i < length; i++)
+    {
+        target[i] = (q7_t)(buffer[i] / scale); /* plain cast, no clamping; scale is not checked for 0 */
+    }
+}
+
+static void compare_and_replace_if_larger_q7(q7_t *base,           // in/out: running maxima, updated in place
+                                             const q7_t *target,   // in: candidate values compared against base
+                                             const uint16_t length // number of q7 elements to process
+)
+{
+    q7_t *pIn = base;
+    const q7_t *pCom = target;
+    union arm_nnword in;
+    union arm_nnword com;
+    uint16_t cnt = length >> 2; // number of complete 4-element groups
+
+    while (cnt > 0u)
+    {
+        in.word = arm_nn_read_q7x4((const q7_t *)pIn); // pIn itself is passed by address only to the write below
+        com.word = arm_nn_read_q7x4_ia((const q7_t **)&pCom);
+
+        // per-byte maximum of the two words, written as four plain branches
+        if (com.bytes[0] > in.bytes[0])
+            in.bytes[0] = com.bytes[0];
+        if (com.bytes[1] > in.bytes[1])
+            in.bytes[1] = com.bytes[1];
+        if (com.bytes[2] > in.bytes[2])
+            in.bytes[2] = com.bytes[2];
+        if (com.bytes[3] > in.bytes[3])
+            in.bytes[3] = com.bytes[3];
+
+        arm_nn_write_q7x4_ia(&pIn, in.word); // store the maxima back over base
+
+        cnt--;
+    }
+
+    cnt = length & 0x3; // leftover elements (0..3), handled one at a time
+    while (cnt > 0u)
+    {
+        if (*pCom > *pIn)
+        {
+            *pIn = *pCom;
+        }
+        pIn++;
+        pCom++;
+        cnt--;
+    }
+}
+
+static void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length)
+{ /* Adds the length q7 values at target into the q15 accumulators at base, element by element. */
+    q15_t *pCnt = base;
+    q7_t *pV = target;
+    q31_t v1, v2, vo1, vo2;
+    uint16_t cnt = length >> 2; /* number of complete 4-element groups */
+    q31_t in;
+
+    while (cnt > 0u)
+    {
+        q31_t value = arm_nn_read_q7x4_ia((const q7_t **)&pV);
+        v1 = __SXTB16(__ROR(value, 8)); /* bytes 1 and 3 of value, sign-extended to two halfwords */
+        v2 = __SXTB16(value);           /* bytes 0 and 2 of value, sign-extended to two halfwords */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+        vo2 = __PKHTB(v1, v2, 16); /* top half of v1 with top half of v2 */
+        vo1 = __PKHBT(v2, v1, 16); /* bottom half of v2 with bottom half of v1 */
+
+#else
+
+        vo1 = __PKHTB(v1, v2, 16); /* same repacking, with vo1/vo2 swapped for big-endian */
+        vo2 = __PKHBT(v2, v1, 16);
+
+#endif
+
+        in = arm_nn_read_q15x2(pCnt);
+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in)); /* saturating 16-bit add into the first accumulator pair */
+
+        in = arm_nn_read_q15x2(pCnt);
+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in)); /* saturating 16-bit add into the second accumulator pair */
+
+        cnt--;
+    }
+    cnt = length & 0x3; /* leftover elements (0..3) */
+    while (cnt > 0u)
+    {
+        *pCnt++ += *pV++; /* plain += here, unlike the __QADD16 used in the loop above */
+        cnt--;
+    }
+}
+
+#endif // ARM_MATH_DSP && !ARM_MATH_MVEI
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Pooling
+ * @{
+ */
+
+/**
+ * @brief Q7 max pooling function
+ * @param[in, out]  Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     Not used
+ * @param[in,out]   Im_out      pointer to output tensor
+ *
+ * @details
+ *
+ * The pooling function is implemented as split x-pooling then
+ * y-pooling.
+ *
+ * This pooling function is input-destructive. Input data is undefined
+ * after calling this function.
+ *
+ */
+
+void arm_maxpool_q7_HWC(q7_t *Im_in,
+                        const uint16_t dim_im_in,
+                        const uint16_t ch_im_in,
+                        const uint16_t dim_kernel,
+                        const uint16_t padding,
+                        const uint16_t stride,
+                        const uint16_t dim_im_out,
+                        q7_t *bufferA,
+                        q7_t *Im_out)
+{
+    (void)bufferA; /* not used by either code path below */
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    /* Run the following code for Cortex-M4 and Cortex-M7: pool along x inside Im_in, then along y into Im_out */
+
+    int16_t i_x, i_y;
+
+    /* pass 1: pool each input row along x; results are written back into the same row of Im_in */
+    for (i_y = 0; i_y < dim_im_in; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out; i_x++)
+        {
+            /* x-pooled pixel i_x of row i_y is stored at pixel position i_x of that row */
+            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
+            q7_t *win_start;
+            q7_t *win_stop;
+            if (i_x * stride - padding < 0)
+            {
+                win_start = target; /* window would begin left of the row: start at target instead */
+            }
+            else
+            {
+                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
+            }
+
+            if (i_x * stride - padding + dim_kernel >= dim_im_in)
+            {
+                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; /* clamp to one past the end of row i_y */
+            }
+            else
+            {
+                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
+            }
+
+            /* seed target with the first pixel of the window */
+            /* memmove because source and destination can be the same pixel (win_start may equal target) */
+            memmove(target, win_start, ch_im_in);
+
+            /* fold the remaining window pixels into target, keeping the per-channel maximum */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
+            }
+        }
+    }
+
+    /* pass 2: pool the x-pooled rows of Im_in along y and write the result rows to Im_out */
+    for (i_y = 0; i_y < dim_im_out; i_y++)
+    {
+
+        /* for each output row */
+        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
+        q7_t *row_start;
+        q7_t *row_end;
+        /* first row of the window, clamped to the first image row */
+        if (i_y * stride - padding < 0)
+        {
+            row_start = Im_in;
+        }
+        else
+        {
+            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
+        }
+        /* one past the last row of the window, clamped to the end of the image */
+        if (i_y * stride - padding + dim_kernel >= dim_im_in)
+        {
+            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
+        }
+        else
+        {
+            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
+        }
+
+        /* seed the output row with the first window row */
+        /* only the first dim_im_out pixels of each input row hold x-pooled data */
+        memmove(target, row_start, dim_im_out * ch_im_in);
+
+        /* rows in Im_in are still dim_im_in pixels apart */
+        row_start += ch_im_in * dim_im_in;
+
+        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+    int16_t i_ch_in, i_x, i_y;
+    int16_t k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out; i_x++)
+            {
+                int max = -129; /* start value; presumably just below the q7 minimum so any sample replaces it */
+                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
+                {
+                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) /* skip padded positions */
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; /* NOTE(review): still -129 if no sample was in bounds */
+            }
+        }
+    }
+
+#endif /* ARM_MATH_DSP && !ARM_MATH_MVEI */
+}
+
+/**
+ * @brief Q7 average pooling function
+ * @param[in,out]   Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   Im_out      pointer to output tensor
+ *
+ * @details
+ *
+ * <b>Buffer size:</b>
+ *
+ * bufferA size:  2*dim_im_out*ch_im_in
+ *
+ * The pooling function is implemented as split x-pooling then
+ * y-pooling.
+ *
+ * This pooling function is input-destructive. Input data is undefined
+ * after calling this function.
+ *
+ */
+
+void arm_avepool_q7_HWC(q7_t *Im_in,
+                        const uint16_t dim_im_in,
+                        const uint16_t ch_im_in,
+                        const uint16_t dim_kernel,
+                        const uint16_t padding,
+                        const uint16_t stride,
+                        const uint16_t dim_im_out,
+                        q7_t *bufferA,
+                        q7_t *Im_out)
+{
+
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    /* Run the following code for Cortex-M4 and Cortex-M7: average along x inside Im_in, then along y into Im_out */
+
+    q15_t *buffer = (q15_t *)bufferA; /* bufferA is used as q15 accumulator storage */
+    int16_t i_x, i_y;
+    int16_t count = 0; /* number of samples summed into buffer for the current output */
+
+    /* pass 1: average each input row along x; results are written back into the same row of Im_in */
+    for (i_y = 0; i_y < dim_im_in; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out; i_x++)
+        {
+            /* x-averaged pixel i_x of row i_y is stored at pixel position i_x of that row */
+            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
+            q7_t *win_start;
+            q7_t *win_stop;
+            if (i_x * stride - padding < 0)
+            {
+                win_start = target; /* window would begin left of the row: start at target instead */
+            }
+            else
+            {
+                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
+            }
+
+            if (i_x * stride - padding + dim_kernel >= dim_im_in)
+            {
+                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; /* clamp to one past the end of row i_y */
+            }
+            else
+            {
+                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
+            }
+
+            /* seed the q15 accumulator from the first pixel of the window */
+            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
+            count = 1;
+
+            /* add the remaining window pixels to the accumulator, counting each one */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
+                count++;
+            }
+            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); /* sum / count, stored back as q7 */
+        }
+    }
+
+    /* pass 2: average the x-averaged rows of Im_in along y and write the result rows to Im_out */
+    for (i_y = 0; i_y < dim_im_out; i_y++)
+    {
+        /* for each output row */
+        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
+        q7_t *row_start;
+        q7_t *row_end;
+        /* first row of the window, clamped to the first image row */
+        if (i_y * stride - padding < 0)
+        {
+            row_start = Im_in;
+        }
+        else
+        {
+            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
+        }
+        /* one past the last row of the window, clamped to the end of the image */
+        if (i_y * stride - padding + dim_kernel >= dim_im_in)
+        {
+            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
+        }
+        else
+        {
+            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
+        }
+
+        /* seed the accumulator with the x-averaged part (first dim_im_out pixels) of the first window row */
+        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
+        count = 1;
+
+        /* rows in Im_in are still dim_im_in pixels apart */
+        row_start += ch_im_in * dim_im_in;
+
+        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count); /* average of the q7 x-averages */
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    (void)bufferA; /* the reference path needs no scratch buffer */
+    int16_t i_ch_in, i_x, i_y;
+    int16_t k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out; i_x++)
+            {
+                int sum = 0;
+                int count = 0; /* in-bounds samples only; padded positions are not counted */
+                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
+                {
+                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) /* skip padded positions */
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; /* NOTE(review): divides by 0 if no sample was in bounds */
+            }
+        }
+    }
+
+#endif /* ARM_MATH_DSP && !ARM_MATH_MVEI */
+}
+
+/**
+ * @} end of Pooling group
+ */