1 files changed, 217 insertions, 0 deletions
diff --git a/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c b/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c
new file mode 100644
index 0000000..2db2c2a
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c
@@ -0,0 +1,217 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_add_f16.c
+ * Description:  Floating-point matrix addition
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixAdd
+  @{
+ */
+
+
+/**
+  @brief         Floating-point matrix addition.
+  @param[in]     pSrcA      points to first input matrix structure
+  @param[in]     pSrcB      points to second input matrix structure
+  @param[out]    pDst       points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_add_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+  arm_matrix_instance_f16 * pDst)
+{
+    arm_status status;  
+    uint32_t  numSamples;       /* total number of elements in the matrix  */
+    float16_t *pDataA, *pDataB, *pDataDst;
+    f16x8_t vecA, vecB, vecDst;
+    float16_t const *pSrcAVec;
+    float16_t const *pSrcBVec;
+    uint32_t  blkCnt;           /* loop counters */
+
+    pDataA = pSrcA->pData;
+    pDataB = pSrcB->pData;
+    pDataDst = pDst->pData;
+    pSrcAVec = (float16_t const *) pDataA;
+    pSrcBVec = (float16_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numRows != pSrcB->numRows) ||
+     (pSrcA->numCols != pSrcB->numCols) ||
+     (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif
+ {
+    /*
+     * Total number of samples in the input matrix
+     */
+    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+    blkCnt = numSamples >> 3;
+    while (blkCnt > 0U)
+    {
+        /* C(m,n) = A(m,n) + B(m,n) */
+        /* Add and then store the results in the destination buffer. */
+        vecA = vld1q(pSrcAVec); 
+        pSrcAVec += 8;
+        vecB = vld1q(pSrcBVec); 
+        pSrcBVec += 8;
+        vecDst = vaddq(vecA, vecB);
+        vst1q(pDataDst, vecDst);  
+        pDataDst += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail
+     */
+    blkCnt = numSamples & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q(pSrcAVec); 
+        vecB = vld1q(pSrcBVec); 
+        vecDst = vaddq_m(vecDst, vecA, vecB, p0);
+        vstrhq_p(pDataDst, vecDst, p0);
+    }
+    /* set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+  return (status);
+}
+#else
+
+arm_status arm_mat_add_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
+  float16_t *pInB = pSrcB->pData;                /* input data matrix pointer B */
+  float16_t *pOut = pDst->pData;                 /* output data matrix pointer */
+
+  uint32_t numSamples;                           /* total number of elements in the matrix */
+  uint32_t blkCnt;                               /* loop counters */
+  arm_status status;                             /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numRows != pSrcB->numRows) ||
+      (pSrcA->numCols != pSrcB->numCols) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcA->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Total number of samples in input matrix */
+    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = numSamples >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) + B(m,n) */
+
+      /* Add and store result in destination buffer. */
+      *pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
+
+      *pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
+
+      *pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
+
+      *pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = numSamples % 0x4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) + B(m,n) */
+
+      /* Add and store result in destination buffer. */
+      *pOut++ = (_Float16)*pInA++ + (_Float16)*pInB++;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixAdd group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+