1 files changed, 891 insertions, 0 deletions
diff --git a/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c b/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
new file mode 100644
index 0000000..90bc06d
--- /dev/null
+++ b/Drivers/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
@@ -0,0 +1,891 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_inverse_f16.c
+ * Description:  Floating-point matrix inverse
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix inverse.
+  @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
+  @param[out]    pDst      points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_inverse_f16(
+  const arm_matrix_instance_f16 * pSrc,
+  arm_matrix_instance_f16 * pDst)
+{
+    float16_t *pIn = pSrc->pData;   /* input data matrix pointer */
+    float16_t *pOut = pDst->pData;  /* output data matrix pointer */
+    float16_t *pInT1, *pInT2;   /* Temporary input data matrix pointer */
+    float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
+    float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;    /* Temporary input and output data matrix pointer */
+
+    uint32_t  numRows = pSrc->numRows;  /* Number of rows in the matrix  */
+    uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix  */
+    float16_t *pTmpA, *pTmpB;
+
+    _Float16 in = 0.0f16;        /* Temporary input values  */
+    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, l;   /* loop counters */
+    arm_status status;          /* status of matrix inverse */
+    uint32_t  blkCnt;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+   /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
+     || (pSrc->numRows != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+  {
+
+    /*--------------------------------------------------------------------------------------------------------------
+     * Matrix Inverse can be solved using elementary row operations.
+     *
+     *  Gauss-Jordan Method:
+     *
+     *     1. First combine the identity matrix and the input matrix separated by a bar to form an
+     *        augmented matrix as follows:
+     *                      _  _          _     _      _   _         _         _
+     *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
+     *                     |  |            | | |        |   |   =   |           |
+     *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X21 _|
+     *
+     *      2. In our implementation, pDst Matrix is used as identity matrix.
+     *
+     *      3. Begin with the first row. Let i = 1.
+     *
+     *      4. Check to see if the pivot for row i is zero.
+     *         The pivot is the element of the main diagonal that is on the current row.
+     *         For instance, if working with row i, then the pivot element is aii.
+     *         If the pivot is zero, exchange that row with a row below it that does not
+     *         contain a zero in column i. If this is not possible, then an inverse
+     *         to that matrix does not exist.
+     *
+     *      5. Divide every element of row i by the pivot.
+     *
+     *      6. For every row below and  row i, replace that row with the sum of that row and
+     *         a multiple of row i so that each new element in column i below row i is zero.
+     *
+     *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+     *         for every element below and above the main diagonal.
+     *
+     *      8. Now an identical matrix is formed to the left of the bar(input matrix, src).
+     *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
+     *----------------------------------------------------------------------------------------------------------------*/
+
+        /*
+         * Working pointer for destination matrix
+         */
+        pOutT1 = pOut;
+        /*
+         * Loop over the number of rows
+         */
+        rowCnt = numRows;
+        /*
+         * Making the destination matrix as identity matrix
+         */
+        while (rowCnt > 0U)
+        {
+            /*
+             * Writing all zeroes in lower triangle of the destination matrix
+             */
+            j = numRows - rowCnt;
+            while (j > 0U)
+            {
+                *pOutT1++ = 0.0f16;
+                j--;
+            }
+            /*
+             * Writing all ones in the diagonal of the destination matrix
+             */
+            *pOutT1++ = 1.0f16;
+            /*
+             * Writing all zeroes in upper triangle of the destination matrix
+             */
+            j = rowCnt - 1U;
+            while (j > 0U)
+            {
+                *pOutT1++ = 0.0f16;
+                j--;
+            }
+            /*
+             * Decrement the loop counter
+             */
+            rowCnt--;
+        }
+
+        /*
+         * Loop over the number of columns of the input matrix.
+         * All the elements in each column are processed by the row operations
+         */
+        loopCnt = numCols;
+        /*
+         * Index modifier to navigate through the columns
+         */
+        l = 0U;
+        while (loopCnt > 0U)
+        {
+            /*
+             * Check if the pivot element is zero..
+             * If it is zero then interchange the row with non zero row below.
+             * If there is no non zero element to replace in the rows below,
+             * then the matrix is Singular.
+             */
+
+            /*
+             * Working pointer for the input matrix that points
+             * * to the pivot element of the particular row
+             */
+            pInT1 = pIn + (l * numCols);
+            /*
+             * Working pointer for the destination matrix that points
+             * * to the pivot element of the particular row
+             */
+            pOutT1 = pOut + (l * numCols);
+            /*
+             * Temporary variable to hold the pivot value
+             */
+            in = *pInT1;
+            
+
+            /*
+             * Check if the pivot element is zero
+             */
+            if ((_Float16)*pInT1 == 0.0f16)
+            {
+                /*
+                 * Loop over the number rows present below
+                 */
+                for (i = 1U; i < numRows-l; i++)
+                {
+                    /*
+                     * Update the input and destination pointers
+                     */
+                    pInT2 = pInT1 + (numCols * i);
+                    pOutT2 = pOutT1 + (numCols * i);
+                    /*
+                     * Check if there is a non zero pivot element to
+                     * * replace in the rows below
+                     */
+                    if ((_Float16)*pInT2 != 0.0f16)
+                    {
+                        f16x8_t vecA, vecB;
+                        /*
+                         * Loop over number of columns
+                         * * to the right of the pilot element
+                         */
+                        pTmpA = pInT1;
+                        pTmpB = pInT2;
+                        blkCnt = (numCols - l) >> 3;
+                        while (blkCnt > 0U)
+                        {
+                            
+                            vecA = vldrhq_f16(pTmpA);
+                            vecB = vldrhq_f16(pTmpB);
+                            vstrhq_f16(pTmpB, vecA);
+                            vstrhq_f16(pTmpA, vecB);
+
+                            pTmpA += 8;
+                            pTmpB += 8;
+                            /*
+                             * Decrement the blockSize loop counter
+                             */
+                            blkCnt--;
+                        }
+                        /*
+                         * tail
+                         * (will be merged thru tail predication)
+                         */
+                        blkCnt = (numCols - l) & 7;
+                        if (blkCnt > 0U)
+                        {
+                            mve_pred16_t p0 = vctp16q(blkCnt);
+
+                            vecA = vldrhq_f16(pTmpA);
+                            vecB = vldrhq_f16(pTmpB);
+                            vstrhq_p_f16(pTmpB, vecA, p0);
+                            vstrhq_p_f16(pTmpA, vecB, p0);
+                        }
+
+                        pInT1 += numCols - l;
+                        pInT2 += numCols - l;
+                        pTmpA = pOutT1;
+                        pTmpB = pOutT2;
+                        blkCnt = numCols >> 3;
+                        while (blkCnt > 0U)
+                        {
+
+                            vecA = vldrhq_f16(pTmpA);
+                            vecB = vldrhq_f16(pTmpB);
+                            vstrhq_f16(pTmpB, vecA);
+                            vstrhq_f16(pTmpA, vecB);
+                            pTmpA += 8;
+                            pTmpB += 8;
+                            /*
+                             * Decrement the blockSize loop counter
+                             */
+                            blkCnt--;
+                        }
+                        /*
+                         * tail
+                         */
+                        blkCnt = numCols & 7;
+                        if (blkCnt > 0U)
+                        {
+                            mve_pred16_t p0 = vctp16q(blkCnt);
+
+                            vecA = vldrhq_f16(pTmpA);
+                            vecB = vldrhq_f16(pTmpB);
+                            vstrhq_p_f16(pTmpB, vecA, p0);
+                            vstrhq_p_f16(pTmpA, vecB, p0);
+                        }
+
+                        pOutT1 += numCols;
+                        pOutT2 += numCols;
+                        /*
+                         * Flag to indicate whether exchange is done or not
+                         */
+                        flag = 1U;
+
+                        /*
+                         * Break after exchange is done
+                         */
+                        break;
+                    }
+              
+                }
+            }
+
+            /*
+             * Update the status if the matrix is singular
+             */
+            if ((flag != 1U) && (in == 0.0f16))
+            {
+                return ARM_MATH_SINGULAR;
+            }
+
+            /*
+             * Points to the pivot row of input and destination matrices
+             */
+            pPivotRowIn = pIn + (l * numCols);
+            pPivotRowDst = pOut + (l * numCols);
+
+            /*
+             * Temporary pointers to the pivot row pointers
+             */
+            pInT1 = pPivotRowIn;
+            pOutT1 = pPivotRowDst;
+
+            /*
+             * Pivot element of the row
+             */
+            in = *(pIn + (l * numCols));
+
+            pTmpA = pInT1;
+
+            f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
+
+            blkCnt = (numCols - l) >> 3;
+            f16x8_t vecA;
+            while (blkCnt > 0U)
+            {
+                *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
+                pTmpA += 8;
+                /*
+                 * Decrement the blockSize loop counter
+                 */
+                blkCnt--;
+            }
+            /*
+             * tail
+             */
+            blkCnt = (numCols - l) & 7;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp16q(blkCnt);
+                
+
+                vecA = vldrhq_f16(pTmpA);
+                vecA = vecA * invIn;
+                vstrhq_p_f16(pTmpA, vecA, p0);
+            }
+
+            pInT1 += numCols - l;
+            /*
+             * Loop over number of columns
+             * * to the right of the pilot element
+             */
+
+            pTmpA = pOutT1;
+            blkCnt = numCols >> 3;
+            while (blkCnt > 0U)
+            {
+                *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA *invIn;
+                pTmpA += 8;
+                /*
+                 * Decrement the blockSize loop counter
+                 */
+                blkCnt--;
+            }
+            /*
+             * tail
+             * (will be merged thru tail predication)
+             */
+            blkCnt = numCols & 7;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp16q(blkCnt);
+
+                vecA = vldrhq_f16(pTmpA);
+                vecA = vecA * invIn;
+                vstrhq_p_f16(pTmpA, vecA, p0);
+            }
+
+            pOutT1 += numCols;
+
+            /*
+             * Replace the rows with the sum of that row and a multiple of row i
+             * * so that each new element in column i above row i is zero.
+             */
+
+            /*
+             * Temporary pointers for input and destination matrices
+             */
+            pInT1 = pIn;
+            pOutT1 = pOut;
+
+            for (i = 0U; i < numRows; i++)
+            {
+                /*
+                 * Check for the pivot element
+                 */
+                if (i == l)
+                {
+                    /*
+                     * If the processing element is the pivot element,
+                     * only the columns to the right are to be processed
+                     */
+                    pInT1 += numCols - l;
+                    pOutT1 += numCols;
+                }
+                else
+                {
+                    /*
+                     * Element of the reference row
+                     */
+
+                    /*
+                     * Working pointers for input and destination pivot rows
+                     */
+                    pPRT_in = pPivotRowIn;
+                    pPRT_pDst = pPivotRowDst;
+                    /*
+                     * Loop over the number of columns to the right of the pivot element,
+                     * to replace the elements in the input matrix
+                     */
+
+                    in = *pInT1;
+                    f16x8_t tmpV = vdupq_n_f16(in);
+
+                    blkCnt = (numCols - l) >> 3;
+                    while (blkCnt > 0U)
+                    {
+                        f16x8_t vec1, vec2;
+                        /*
+                         * Replace the element by the sum of that row
+                         * and a multiple of the reference row
+                         */
+                        vec1 = vldrhq_f16(pInT1);
+                        vec2 = vldrhq_f16(pPRT_in);
+                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
+                        vstrhq_f16(pInT1, vec1);
+                        pPRT_in += 8;
+                        pInT1 += 8;
+                        /*
+                         * Decrement the blockSize loop counter
+                         */
+                        blkCnt--;
+                    }
+                    /*
+                     * tail
+                     * (will be merged thru tail predication)
+                     */
+                    blkCnt = (numCols - l) & 7;
+                    if (blkCnt > 0U)
+                    {
+                        f16x8_t vec1, vec2;
+                        mve_pred16_t p0 = vctp16q(blkCnt);
+
+                        vec1 = vldrhq_f16(pInT1);
+                        vec2 = vldrhq_f16(pPRT_in);
+                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
+                        vstrhq_p_f16(pInT1, vec1, p0);
+                        pInT1 += blkCnt;
+                    }
+
+                    blkCnt = numCols >> 3;
+                    while (blkCnt > 0U)
+                    {
+                        f16x8_t vec1, vec2;
+
+                        /*
+                         * Replace the element by the sum of that row
+                         * and a multiple of the reference row
+                         */
+                        vec1 = vldrhq_f16(pOutT1);
+                        vec2 = vldrhq_f16(pPRT_pDst);
+                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
+                        vstrhq_f16(pOutT1, vec1);
+                        pPRT_pDst += 8;
+                        pOutT1 += 8;
+                        /*
+                         * Decrement the blockSize loop counter
+                         */
+                        blkCnt--;
+                    }
+                    /*
+                     * tail
+                     * (will be merged thru tail predication)
+                     */
+                    blkCnt = numCols & 7;
+                    if (blkCnt > 0U)
+                    {
+                        f16x8_t vec1, vec2;
+                        mve_pred16_t p0 = vctp16q(blkCnt);
+
+                        vec1 = vldrhq_f16(pOutT1);
+                        vec2 = vldrhq_f16(pPRT_pDst);
+                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
+                        vstrhq_p_f16(pOutT1, vec1, p0);
+
+                        pInT2 += blkCnt;
+                        pOutT1 += blkCnt;
+                    }
+                }
+                /*
+                 * Increment the temporary input pointer
+                 */
+                pInT1 = pInT1 + l;
+            }
+            /*
+             * Increment the input pointer
+             */
+            pIn++;
+            /*
+             * Decrement the loop counter
+             */
+            loopCnt--;
+            /*
+             * Increment the index modifier
+             */
+            l++;
+        }
+
+        /*
+         * Set status as ARM_MATH_SUCCESS
+         */
+        status = ARM_MATH_SUCCESS;
+
+        if ((flag != 1U) && (in == 0.0f16))
+        {
+            pIn = pSrc->pData;
+            for (i = 0; i < numRows * numCols; i++)
+            {
+                if ((_Float16)pIn[i] != 0.0f16)
+                    break;
+            }
+
+            if (i == numRows * numCols)
+                status = ARM_MATH_SINGULAR;
+        }
+  }
+  /* Return to application */
+  return (status);
+}
+
+#else
+
+arm_status arm_mat_inverse_f16(
+  const arm_matrix_instance_f16 * pSrc,
+        arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float16_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float16_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
+  float16_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
+  float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
+  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
+  uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
+
+  _Float16 Xchg, in = 0.0f16, in1;                /* Temporary input values  */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
+  arm_status status;                             /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    /*--------------------------------------------------------------------------------------------------------------
+     * Matrix Inverse can be solved using elementary row operations.
+     *
+     *  Gauss-Jordan Method:
+     *
+     *      1. First combine the identity matrix and the input matrix separated by a bar to form an
+     *        augmented matrix as follows:
+     *                      _                  _         _         _
+     *                     |  a11  a12 | 1   0  |       |  X11 X12  |
+     *                     |           |        |   =   |           |
+     *                     |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
+     *
+     *      2. In our implementation, pDst Matrix is used as identity matrix.
+     *
+     *      3. Begin with the first row. Let i = 1.
+     *
+     *      4. Check to see if the pivot for row i is zero.
+     *         The pivot is the element of the main diagonal that is on the current row.
+     *         For instance, if working with row i, then the pivot element is aii.
+     *         If the pivot is zero, exchange that row with a row below it that does not
+     *         contain a zero in column i. If this is not possible, then an inverse
+     *         to that matrix does not exist.
+     *
+     *      5. Divide every element of row i by the pivot.
+     *
+     *      6. For every row below and  row i, replace that row with the sum of that row and
+     *         a multiple of row i so that each new element in column i below row i is zero.
+     *
+     *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+     *         for every element below and above the main diagonal.
+     *
+     *      8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
+     *         Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+     *----------------------------------------------------------------------------------------------------------------*/
+
+    /* Working pointer for destination matrix */
+    pOutT1 = pOut;
+
+    /* Loop over the number of rows */
+    rowCnt = numRows;
+
+    /* Making the destination matrix as identity matrix */
+    while (rowCnt > 0U)
+    {
+      /* Writing all zeroes in lower triangle of the destination matrix */
+      j = numRows - rowCnt;
+      while (j > 0U)
+      {
+        *pOutT1++ = 0.0f16;
+        j--;
+      }
+
+      /* Writing all ones in the diagonal of the destination matrix */
+      *pOutT1++ = 1.0f16;
+
+      /* Writing all zeroes in upper triangle of the destination matrix */
+      j = rowCnt - 1U;
+      while (j > 0U)
+      {
+        *pOutT1++ = 0.0f16;
+        j--;
+      }
+
+      /* Decrement loop counter */
+      rowCnt--;
+    }
+
+    /* Loop over the number of columns of the input matrix.
+       All the elements in each column are processed by the row operations */
+    loopCnt = numCols;
+
+    /* Index modifier to navigate through the columns */
+    l = 0U;
+
+    while (loopCnt > 0U)
+    {
+      /* Check if the pivot element is zero..
+       * If it is zero then interchange the row with non zero row below.
+       * If there is no non zero element to replace in the rows below,
+       * then the matrix is Singular. */
+
+      /* Working pointer for the input matrix that points
+       * to the pivot element of the particular row  */
+      pInT1 = pIn + (l * numCols);
+
+      /* Working pointer for the destination matrix that points
+       * to the pivot element of the particular row  */
+      pOutT1 = pOut + (l * numCols);
+
+      /* Temporary variable to hold the pivot value */
+      in = *pInT1;
+
+
+      /* Check if the pivot element is zero */
+      if ((_Float16)*pInT1 == 0.0f16)
+      {
+        /* Loop over the number rows present below */
+
+        for (i = 1U; i < numRows-l; i++)
+        {
+          /* Update the input and destination pointers */
+          pInT2 = pInT1 + (numCols * i);
+          pOutT2 = pOutT1 + (numCols * i);
+
+          /* Check if there is a non zero pivot element to
+           * replace in the rows below */
+          if ((_Float16)*pInT2 != 0.0f16)
+          {
+            /* Loop over number of columns
+             * to the right of the pilot element */
+            j = numCols - l;
+
+            while (j > 0U)
+            {
+              /* Exchange the row elements of the input matrix */
+              Xchg = *pInT2;
+              *pInT2++ = *pInT1;
+              *pInT1++ = Xchg;
+
+              /* Decrement the loop counter */
+              j--;
+            }
+
+            /* Loop over number of columns of the destination matrix */
+            j = numCols;
+
+            while (j > 0U)
+            {
+              /* Exchange the row elements of the destination matrix */
+              Xchg = *pOutT2;
+              *pOutT2++ = *pOutT1;
+              *pOutT1++ = Xchg;
+
+              /* Decrement loop counter */
+              j--;
+            }
+
+            /* Flag to indicate whether exchange is done or not */
+            flag = 1U;
+
+            /* Break after exchange is done */
+            break;
+          }
+
+        }
+      }
+
+      /* Update the status if the matrix is singular */
+      if ((flag != 1U) && (in == 0.0f16))
+      {
+        return ARM_MATH_SINGULAR;
+      }
+
+      /* Points to the pivot row of input and destination matrices */
+      pPivotRowIn = pIn + (l * numCols);
+      pPivotRowDst = pOut + (l * numCols);
+
+      /* Temporary pointers to the pivot row pointers */
+      pInT1 = pPivotRowIn;
+      pInT2 = pPivotRowDst;
+
+      /* Pivot element of the row */
+      in = *pPivotRowIn;
+
+      /* Loop over number of columns
+       * to the right of the pilot element */
+      j = (numCols - l);
+
+      while (j > 0U)
+      {
+        /* Divide each element of the row of the input matrix
+         * by the pivot element */
+        in1 = *pInT1;
+        *pInT1++ = in1 / in;
+
+        /* Decrement the loop counter */
+        j--;
+      }
+
+      /* Loop over number of columns of the destination matrix */
+      j = numCols;
+
+      while (j > 0U)
+      {
+        /* Divide each element of the row of the destination matrix
+         * by the pivot element */
+        in1 = *pInT2;
+        *pInT2++ = in1 / in;
+
+        /* Decrement the loop counter */
+        j--;
+      }
+
+      /* Replace the rows with the sum of that row and a multiple of row i
+       * so that each new element in column i above row i is zero.*/
+
+      /* Temporary pointers for input and destination matrices */
+      pInT1 = pIn;
+      pInT2 = pOut;
+
+      /* index used to check for pivot element */
+      i = 0U;
+
+      /* Loop over number of rows */
+      /*  to be replaced by the sum of that row and a multiple of row i */
+      k = numRows;
+
+      while (k > 0U)
+      {
+        /* Check for the pivot element */
+        if (i == l)
+        {
+          /* If the processing element is the pivot element,
+             only the columns to the right are to be processed */
+          pInT1 += numCols - l;
+
+          pInT2 += numCols;
+        }
+        else
+        {
+          /* Element of the reference row */
+          in = *pInT1;
+
+          /* Working pointers for input and destination pivot rows */
+          pPRT_in = pPivotRowIn;
+          pPRT_pDst = pPivotRowDst;
+
+          /* Loop over the number of columns to the right of the pivot element,
+             to replace the elements in the input matrix */
+          j = (numCols - l);
+
+          while (j > 0U)
+          {
+            /* Replace the element by the sum of that row
+               and a multiple of the reference row  */
+            in1 = *pInT1;
+            *pInT1++ = (_Float16)in1 - ((_Float16)in * (_Float16)*pPRT_in++);
+
+            /* Decrement the loop counter */
+            j--;
+          }
+
+          /* Loop over the number of columns to
+             replace the elements in the destination matrix */
+          j = numCols;
+
+          while (j > 0U)
+          {
+            /* Replace the element by the sum of that row
+               and a multiple of the reference row  */
+            in1 = *pInT2;
+            *pInT2++ = (_Float16)in1 - ((_Float16)in * (_Float16)*pPRT_pDst++);
+
+            /* Decrement loop counter */
+            j--;
+          }
+
+        }
+
+        /* Increment temporary input pointer */
+        pInT1 = pInT1 + l;
+
+        /* Decrement loop counter */
+        k--;
+
+        /* Increment pivot index */
+        i++;
+      }
+
+      /* Increment the input pointer */
+      pIn++;
+
+      /* Decrement the loop counter */
+      loopCnt--;
+
+      /* Increment the index modifier */
+      l++;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+
+    if ((flag != 1U) && ((_Float16)in == 0.0f16))
+    {
+      pIn = pSrc->pData;
+      for (i = 0; i < numRows * numCols; i++)
+      {
+        if ((_Float16)pIn[i] != 0.0f16)
+            break;
+      }
+
+      if (i == numRows * numCols)
+        status = ARM_MATH_SINGULAR;
+    }
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of MatrixInv group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+