diff --git a/ARM_math/arm_mat_inverse_f32.c b/ARM_math/arm_mat_inverse_f32.c
new file mode 100644
index 0000000..9c0afe9
--- /dev/null
+++ b/ARM_math/arm_mat_inverse_f32.c
@@ -0,0 +1,299 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_inverse_f32.c
+ * Description:  Floating-point matrix inverse
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "matrix_functions.h"
+#include "matrix_utils.h"
+
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @defgroup MatrixInv Matrix Inverse
+
+  Computes the inverse of a matrix.
+
+  The inverse is defined only if the input matrix is square and non-singular (the determinant is non-zero).
+  The function checks that the input and output matrices are square and of the same size.
+
+  Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
+  inversion of floating-point matrices.
+
+  @par Algorithm
+  The Gauss-Jordan method is used to find the inverse.
+  The algorithm performs a sequence of elementary row-operations until it
+  reduces the input matrix to an identity matrix. Applying the same sequence
+  of elementary row-operations to an identity matrix yields the inverse matrix.
+  If the input matrix is singular, then the algorithm terminates and returns error status
+  <code>ARM_MATH_SINGULAR</code>.
+ 
+  @par Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method 
+
+  \f[
+  \begin{pmatrix}
+   a_{1,1} & a_{1,2} & a_{1,3} & | & 1 & 0 & 0\\
+   a_{2,1} & a_{2,2} & a_{2,3} & | & 0 & 1 & 0\\
+   a_{3,1} & a_{3,2} & a_{3,3} & | & 0 & 0 & 1\\
+  \end{pmatrix}
+  \rightarrow
+  \begin{pmatrix}
+   1 & 0 & 0 & | & x_{1,1} & x_{2,1} & x_{3,1} \\
+   0 & 1 & 0 & | & x_{1,2} & x_{2,2} & x_{3,2} \\
+   0 & 0 & 1 & | & x_{1,3} & x_{2,3} & x_{3,3} \\
+  \end{pmatrix}
+  \f]
+ */
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix inverse.
+  @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
+  @param[out]    pDst      points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
+ */
+arm_status arm_mat_inverse_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  
+  float32_t *pTmp;
+  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
+  uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
+
+
+  float32_t pivot = 0.0f, newPivot=0.0f;                /* Temporary input values  */
+  uint32_t selectedRow,pivotRow,i, rowNb, rowCnt, flag = 0U, j,column;      /* loop counters */
+  arm_status status;                             /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /*--------------------------------------------------------------------------------------------------------------
+     * Matrix Inverse can be solved using elementary row operations.
+     *
+     *  Gauss-Jordan Method:
+     *
+     *      1. First combine the identity matrix and the input matrix separated by a bar to form an
+     *        augmented matrix as follows:
+     *                      _                  _         _         _
+     *                     |  a11  a12 | 1   0  |       |  X11 X12  |
+     *                     |           |        |   =   |           |
+     *                     |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
+     *
+     *      2. In our implementation, pDst Matrix is used as identity matrix.
+     *
+     *      3. Begin with the first row. Let i = 1.
+     *
+     *      4. Check to see if the pivot for row i is zero.
+     *         The pivot is the element of the main diagonal that is on the current row.
+     *         For instance, if working with row i, then the pivot element is aii.
+     *         If the pivot is zero, exchange that row with a row below it that does not
+     *         contain a zero in column i. If this is not possible, then an inverse
+     *         to that matrix does not exist.
+     *
+     *      5. Divide every element of row i by the pivot.
+     *
+     *      6. For every row below and  row i, replace that row with the sum of that row and
+     *         a multiple of row i so that each new element in column i below row i is zero.
+     *
+     *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+     *         for every element below and above the main diagonal.
+     *
+     *      8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
+     *         Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+     *----------------------------------------------------------------------------------------------------------------*/
+
+    /* Working pointer for destination matrix */
+    pTmp = pOut;
+
+    /* Loop over the number of rows */
+    rowCnt = numRows;
+
+    /* Making the destination matrix as identity matrix */
+    while (rowCnt > 0U)
+    {
+      /* Writing all zeroes in lower triangle of the destination matrix */
+      j = numRows - rowCnt;
+      while (j > 0U)
+      {
+        *pTmp++ = 0.0f;
+        j--;
+      }
+
+      /* Writing all ones in the diagonal of the destination matrix */
+      *pTmp++ = 1.0f;
+
+      /* Writing all zeroes in upper triangle of the destination matrix */
+      j = rowCnt - 1U;
+      while (j > 0U)
+      {
+        *pTmp++ = 0.0f;
+        j--;
+      }
+
+      /* Decrement loop counter */
+      rowCnt--;
+    }
+
+    /* Loop over the number of columns of the input matrix.
+       All the elements in each column are processed by the row operations */
+
+    /* Index modifier to navigate through the columns */
+    for(column = 0U; column < numCols; column++)
+    {
+      /* Check if the pivot element is zero..
+       * If it is zero then interchange the row with non zero row below.
+       * If there is no non zero element to replace in the rows below,
+       * then the matrix is Singular. */
+
+      pivotRow = column;
+
+      /* Temporary variable to hold the pivot value */
+      pTmp = ELEM(pSrc,column,column) ;
+      pivot = *pTmp;
+      selectedRow = column;
+
+      /* Find maximum pivot in column */
+      
+        /* Loop over the number rows present below */
+
+      for (rowNb = column+1; rowNb < numRows; rowNb++)
+      {
+          /* Update the input and destination pointers */
+          pTmp = ELEM(pSrc,rowNb,column);
+          newPivot = *pTmp;
+          if (fabsf(newPivot) > fabsf(pivot))
+          {
+            selectedRow = rowNb; 
+            pivot = newPivot;
+          }
+      }
+        
+      /* Check if there is a non zero pivot element to
+       * replace in the rows below */
+      if ((pivot != 0.0f) && (selectedRow != column))
+      {
+            
+            SWAP_ROWS_F32(pSrc,column, pivotRow,selectedRow);
+            SWAP_ROWS_F32(pDst,0, pivotRow,selectedRow);
+
+    
+            /* Flag to indicate whether exchange is done or not */
+            flag = 1U;
+       }
+
+
+      
+      
+
+      /* Update the status if the matrix is singular */
+      if ((flag != 1U) && (pivot == 0.0f))
+      {
+        return ARM_MATH_SINGULAR;
+      }
+
+     
+      /* Pivot element of the row */
+      pivot = 1.0f / pivot;
+
+      SCALE_ROW_F32(pSrc,column,pivot,pivotRow);
+      SCALE_ROW_F32(pDst,0,pivot,pivotRow);
+
+      
+      /* Replace the rows with the sum of that row and a multiple of row i
+       * so that each new element in column i above row i is zero.*/
+
+      rowNb = 0;
+      for (;rowNb < pivotRow; rowNb++)
+      {
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
+
+           MAS_ROW_F32(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F32(0     ,pDst,rowNb,pivot,pDst,pivotRow);
+
+
+      }
+
+      for (rowNb = pivotRow + 1; rowNb < numRows; rowNb++)
+      {
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
+
+           MAS_ROW_F32(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F32(0     ,pDst,rowNb,pivot,pDst,pivotRow);
+
+      }
+
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+
+    if ((flag != 1U) && (pivot == 0.0f))
+    {
+      pIn = pSrc->pData;
+      for (i = 0; i < numRows * numCols; i++)
+      {
+        if (pIn[i] != 0.0f)
+            break;
+      }
+
+      if (i == numRows * numCols)
+        status = ARM_MATH_SINGULAR;
+    }
+  }
+
+  /* Return to application */
+  return (status);
+}
+/**
+  @} end of MatrixInv group
+ */
diff --git a/ARM_math/arm_mat_inverse_f64.c b/ARM_math/arm_mat_inverse_f64.c
new file mode 100644
index 0000000..d55806a
--- /dev/null
+++ b/ARM_math/arm_mat_inverse_f64.c
@@ -0,0 +1,263 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_inverse_f64.c
+ * Description:  Floating-point matrix inverse
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "matrix_functions.h"
+#include "matrix_utils.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+/**
+  @addtogroup MatrixInv
+  @{
+ */
+
+/**
+  @brief         Floating-point (64 bit) matrix inverse.
+  @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
+  @param[out]    pDst      points to output matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
+ */
+
+arm_status arm_mat_inverse_f64(
+  const arm_matrix_instance_f64 * pSrc,
+        arm_matrix_instance_f64 * pDst)
+{
+  float64_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float64_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  
+  float64_t *pTmp;
+  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
+  uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
+
+
+  float64_t pivot = 0.0, newPivot=0.0;                /* Temporary input values  */
+  uint32_t selectedRow,pivotRow,i, rowNb, rowCnt, flag = 0U, j,column;      /* loop counters */
+  arm_status status;                             /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /*--------------------------------------------------------------------------------------------------------------
+     * Matrix Inverse can be solved using elementary row operations.
+     *
+     *  Gauss-Jordan Method:
+     *
+     *      1. First combine the identity matrix and the input matrix separated by a bar to form an
+     *        augmented matrix as follows:
+     *                      _                  _         _         _
+     *                     |  a11  a12 | 1   0  |       |  X11 X12  |
+     *                     |           |        |   =   |           |
+     *                     |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
+     *
+     *      2. In our implementation, pDst Matrix is used as identity matrix.
+     *
+     *      3. Begin with the first row. Let i = 1.
+     *
+     *      4. Check to see if the pivot for row i is zero.
+     *         The pivot is the element of the main diagonal that is on the current row.
+     *         For instance, if working with row i, then the pivot element is aii.
+     *         If the pivot is zero, exchange that row with a row below it that does not
+     *         contain a zero in column i. If this is not possible, then an inverse
+     *         to that matrix does not exist.
+     *
+     *      5. Divide every element of row i by the pivot.
+     *
+     *      6. For every row below and  row i, replace that row with the sum of that row and
+     *         a multiple of row i so that each new element in column i below row i is zero.
+     *
+     *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+     *         for every element below and above the main diagonal.
+     *
+     *      8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
+     *         Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+     *----------------------------------------------------------------------------------------------------------------*/
+
+    /* Working pointer for destination matrix */
+    pTmp = pOut;
+
+    /* Loop over the number of rows */
+    rowCnt = numRows;
+
+    /* Making the destination matrix as identity matrix */
+    while (rowCnt > 0U)
+    {
+      /* Writing all zeroes in lower triangle of the destination matrix */
+      j = numRows - rowCnt;
+      while (j > 0U)
+      {
+        *pTmp++ = 0.0;
+        j--;
+      }
+
+      /* Writing all ones in the diagonal of the destination matrix */
+      *pTmp++ = 1.0;
+
+      /* Writing all zeroes in upper triangle of the destination matrix */
+      j = rowCnt - 1U;
+      while (j > 0U)
+      {
+        *pTmp++ = 0.0;
+        j--;
+      }
+
+      /* Decrement loop counter */
+      rowCnt--;
+    }
+
+    /* Loop over the number of columns of the input matrix.
+       All the elements in each column are processed by the row operations */
+
+    /* Index modifier to navigate through the columns */
+    for(column = 0U; column < numCols; column++)
+    {
+      /* Check if the pivot element is zero..
+       * If it is zero then interchange the row with non zero row below.
+       * If there is no non zero element to replace in the rows below,
+       * then the matrix is Singular. */
+
+      pivotRow = column;
+
+      /* Temporary variable to hold the pivot value */
+      pTmp = ELEM(pSrc,column,column) ;
+      pivot = *pTmp;
+      selectedRow = column;
+
+      
+        /* Loop over the number rows present below */
+
+      for (rowNb = column+1; rowNb < numRows; rowNb++)
+      {
+          /* Update the input and destination pointers */
+          pTmp = ELEM(pSrc,rowNb,column);
+          newPivot = *pTmp;
+          if (fabs(newPivot) > fabs(pivot))
+          {
+            selectedRow = rowNb; 
+            pivot = newPivot;
+          }
+      }
+
+          /* Check if there is a non zero pivot element to
+           * replace in the rows below */
+      if ((pivot != 0.0) && (selectedRow != column))
+      {
+            /* Loop over number of columns
+             * to the right of the pilot element */
+
+            SWAP_ROWS_F64(pSrc,column, pivotRow,selectedRow);
+            SWAP_ROWS_F64(pDst,0, pivotRow,selectedRow);
+
+    
+            /* Flag to indicate whether exchange is done or not */
+            flag = 1U;
+
+      }
+
+
+      /* Update the status if the matrix is singular */
+      if ((flag != 1U) && (pivot == 0.0))
+      {
+        return ARM_MATH_SINGULAR;
+      }
+
+     
+      /* Pivot element of the row */
+      pivot = 1.0 / pivot;
+
+      SCALE_ROW_F64(pSrc,column,pivot,pivotRow);
+      SCALE_ROW_F64(pDst,0,pivot,pivotRow);
+
+      
+      /* Replace the rows with the sum of that row and a multiple of row i
+       * so that each new element in column i above row i is zero.*/
+
+      rowNb = 0;
+      for (;rowNb < pivotRow; rowNb++)
+      {
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
+
+           MAS_ROW_F64(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F64(0     ,pDst,rowNb,pivot,pDst,pivotRow);
+
+
+      }
+
+      for (rowNb = pivotRow + 1; rowNb < numRows; rowNb++)
+      {
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
+
+           MAS_ROW_F64(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F64(0     ,pDst,rowNb,pivot,pDst,pivotRow);
+
+      }
+
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+
+    if ((flag != 1U) && (pivot == 0.0))
+    {
+      pIn = pSrc->pData;
+      for (i = 0; i < numRows * numCols; i++)
+      {
+        if (pIn[i] != 0.0)
+            break;
+      }
+
+      if (i == numRows * numCols)
+        status = ARM_MATH_SINGULAR;
+    }
+  }
+
+  /* Return to application */
+  return (status);
+}
+/**
+  @} end of MatrixInv group
+ */
diff --git a/ARM_math/arm_mat_mult_f32.c b/ARM_math/arm_mat_mult_f32.c
new file mode 100644
index 0000000..978e8be
--- /dev/null
+++ b/ARM_math/arm_mat_mult_f32.c
@@ -0,0 +1,1021 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_mult_f32.c
+ * Description:  Floating-point matrix multiplication
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "matrix_functions.h"
+
+#if defined(ARM_MATH_NEON)
+#define GROUPOFROWS 8
+#endif
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixMult Matrix Multiplication
+ *
+ * Multiplies two matrices.
+ *
+ * @par Multiplication of two 3x3 matrices:
+ * 
+ * \f[
+ * \begin{pmatrix}
+ *  a_{1,1} & a_{1,2} & a_{1,3} \\
+ *  a_{2,1} & a_{2,2} & a_{2,3} \\
+ *  a_{3,1} & a_{3,2} & a_{3,3} \\
+ * \end{pmatrix}
+ * 
+ * \begin{pmatrix}
+ *  b_{1,1} & b_{1,2} & b_{1,3} \\
+ *  b_{2,1} & b_{2,2} & b_{2,3} \\
+ *  b_{3,1} & b_{3,2} & b_{3,3} \\
+ * \end{pmatrix}
+ * =
+ * \begin{pmatrix}
+ *  a_{1,1} b_{1,1}+a_{1,2} b_{2,1}+a_{1,3} b_{3,1} & a_{1,1} b_{1,2}+a_{1,2} b_{2,2}+a_{1,3} b_{3,2} & a_{1,1} b_{1,3}+a_{1,2} b_{2,3}+a_{1,3} b_{3,3} \\
+ *  a_{2,1} b_{1,1}+a_{2,2} b_{2,1}+a_{2,3} b_{3,1} & a_{2,1} b_{1,2}+a_{2,2} b_{2,2}+a_{2,3} b_{3,2} & a_{2,1} b_{1,3}+a_{2,2} b_{2,3}+a_{2,3} b_{3,3} \\
+ *  a_{3,1} b_{1,1}+a_{3,2} b_{2,1}+a_{3,3} b_{3,1} & a_{3,1} b_{1,2}+a_{3,2} b_{2,2}+a_{3,3} b_{3,2} & a_{3,1} b_{1,3}+a_{3,2} b_{2,3}+a_{3,3} b_{3,3} \\
+ * \end{pmatrix}
+ * \f]
+
+ * Matrix multiplication is only defined if the number of columns of the
+ * first matrix equals the number of rows of the second matrix.
+ * Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
+ * in an <code>M x P</code> matrix.
+ * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
+ * <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
+ * matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
+ */
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define MATRIX_DIM3 3 
+#define MATRIX_DIM4 4 
+
+__STATIC_INLINE  arm_status arm_mat_mult_f32_2x2_mve(
+    const arm_matrix_instance_f32 *pSrcA,
+    const arm_matrix_instance_f32 *pSrcB,
+    arm_matrix_instance_f32 *pDst)
+{
+    /* {a00, a00, a10, a10} */
+    static const uint32_t  offsetA0[4] = { 0, 0, 2, 2 };
+    /* {b00, b01, b00, b01} */
+    static const uint32_t  offsetB0[4] = { 0, 1, 0, 1 };
+    /* {a01, a01, a11, a11} */
+    static const uint32_t  offsetA1[4] = { 1, 1, 3, 3 };
+    /* {b10, b11, b10, b11} */
+    static const uint32_t  offsetB1[4] = { 2, 3, 2, 3 };
+
+    uint32x4_t vecOffsA, vecOffsB;
+    f32x4_t vecInA, vecInB, vecDst;
+
+    vecOffsA = vldrwq_u32((uint32_t const *) offsetA0);
+    vecOffsB = vldrwq_u32((uint32_t const *) offsetB0);
+
+    vecInA = vldrwq_gather_shifted_offset((float32_t const *) pSrcA->pData, vecOffsA);
+    vecInB = vldrwq_gather_shifted_offset((float32_t const *) pSrcB->pData, vecOffsB);
+
+    vecDst = vmulq(vecInA, vecInB);
+
+    vecOffsA = vldrwq_u32((uint32_t const *) offsetA1);
+    vecOffsB = vldrwq_u32((uint32_t const *) offsetB1);
+
+    vecInA = vldrwq_gather_shifted_offset((float32_t const *) pSrcA->pData, vecOffsA);
+    vecInB = vldrwq_gather_shifted_offset((float32_t const *) pSrcB->pData, vecOffsB);
+
+    vecDst = vfmaq(vecDst, vecInA, vecInB);
+
+    vstrwq_f32(pDst->pData, vecDst);
+
+    return (ARM_MATH_SUCCESS);
+
+}
+
+
+/*
+ * A  =  {{a00, a01, a02},
+ *        {a10, a11, a12},
+ *        {a20, a21, a22}}
+ * B  =  {{b00, b01, b02},
+ *        {b10, b11, b12},
+ *        {b20, b21, b22}}
+ *
+ * Dst = {{a00 b00 + a01 b10 + a02 b20, a00 b01 + a01 b11 + a02 b21, a00 b02 + a01 b12 + a02 b22},
+ *        {a10 b00 + a11 b10 + a12 b20, a10 b01 + a11 b11 + a12 b21, a10 b02 + a11 b12 + a12 b22},
+ *        {a20 b00 + a21 b10 + a22 b20, a20 b01 + a21 b11 + a22 b21, a20 b02 + a21 b12 + a22 b22}}
+ */
+__STATIC_INLINE  arm_status arm_mat_mult_f32_3x3_mve(
+    const arm_matrix_instance_f32 *pSrcA,
+    const arm_matrix_instance_f32 *pSrcB,
+    arm_matrix_instance_f32 *pDst)
+{
+    float32_t   *pInB = pSrcB->pData; /* input data matrix pointer B */
+    float32_t   *pInA = pSrcA->pData; /* input data matrix pointer A  */
+    float32_t   *pOut = pDst->pData;  /* output data matrix pointer */
+    float32_t   *pInA0, *pInA1, *pInA2;
+    f32x4_t    vecMac0, vecMac1, vecMac2;
+    f32x4_t    vecInB;
+    float32_t const *pSrBVec;
+
+    pSrBVec = (float32_t const *) pInB;
+
+    pInA0 = pInA;
+    pInA1 = pInA0 + MATRIX_DIM3;
+    pInA2 = pInA1 + MATRIX_DIM3;
+    /* enable predication to disable last (4th) vector element */
+    mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
+
+    /*
+     * load {b0,0, b0,1, b0,2, 0}
+     */
+    vecInB = vldrwq_z_f32(pSrBVec, p0);  
+    pSrBVec += MATRIX_DIM3;
+
+    vecMac0 = vmulq(vecInB, *pInA0++);
+    vecMac1 = vmulq(vecInB, *pInA1++);
+    vecMac2 = vmulq(vecInB, *pInA2++);
+    /*
+     * load {b1,0, b1,1, b1,2, 0}
+     */
+    vecInB = vldrwq_z_f32(pSrBVec, p0);  
+    pSrBVec += MATRIX_DIM3;
+
+    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+    /*
+     * load {b2,0, b2,1 , b2,2, 0}
+     */
+    vecInB = vldrwq_z_f32(pSrBVec, p0);  
+    pSrBVec += MATRIX_DIM3;
+
+    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+
+    /* partial vector stores */
+    vstrwq_p_f32(pOut, vecMac0, p0); 
+    pOut += MATRIX_DIM3;
+    vstrwq_p_f32(pOut, vecMac1, p0); 
+    pOut += MATRIX_DIM3;
+    vstrwq_p_f32(pOut, vecMac2, p0);
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+
+
+__STATIC_INLINE arm_status arm_mat_mult_f32_4x4_mve(
+    const arm_matrix_instance_f32 *pSrcA,
+    const arm_matrix_instance_f32 *pSrcB,
+    arm_matrix_instance_f32 *pDst)
+{
+    float32_t const *pSrBVec;
+    float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+    float32_t *pInA = pSrcA->pData; /* input data matrix pointer A  */
+    float32_t *pOut = pDst->pData;  /* output data matrix pointer */
+    float32_t *pInA0, *pInA1, *pInA2, *pInA3;
+    f32x4_t vecMac0, vecMac1, vecMac2, vecMac3;
+    f32x4_t vecInB;
+
+    pSrBVec = (float32_t const *) pInB;
+
+    pInA0 = pInA;
+    pInA1 = pInA0 + MATRIX_DIM4;
+    pInA2 = pInA1 + MATRIX_DIM4;
+    pInA3 = pInA2 + MATRIX_DIM4;
+    /*
+     * load {b0,0, b0,1, b0,2, b0,3}
+     */
+    vecInB = vld1q(pSrBVec);  
+    pSrBVec += MATRIX_DIM4;
+
+    vecMac0 = vmulq(vecInB, *pInA0++);
+    vecMac1 = vmulq(vecInB, *pInA1++);
+    vecMac2 = vmulq(vecInB, *pInA2++);
+    vecMac3 = vmulq(vecInB, *pInA3++);
+    /*
+     * load {b1,0, b1,1, b1,2, b1,3}
+     */
+    vecInB = vld1q(pSrBVec);  
+    pSrBVec += MATRIX_DIM4;
+
+    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+    vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+    /*
+     * load {b2,0, b2,1, b2,2, b2,3}
+     */
+    vecInB = vld1q(pSrBVec);  
+    pSrBVec += MATRIX_DIM4;
+
+    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+    vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+    /*
+     * load {b3,0, b3,1, b3,2, b3,3}
+     */
+    vecInB = vld1q(pSrBVec);  
+    pSrBVec += MATRIX_DIM4;
+
+    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+    vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+    vst1q(pOut, vecMac0);  
+    pOut += MATRIX_DIM4;
+    vst1q(pOut, vecMac1);  
+    pOut += MATRIX_DIM4;
+    vst1q(pOut, vecMac2);  
+    pOut += MATRIX_DIM4;
+    vst1q(pOut, vecMac3);
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
+arm_status arm_mat_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+    float32_t  *pInB = pSrcB->pData;        /* input data matrix pointer B */
+    float32_t  *pInA = pSrcA->pData;        /* input data matrix pointer A  */
+    float32_t  *pOut = pDst->pData;         /* output data matrix pointer */
+    int         numRowsA = pSrcA->numRows;  /* number of rows of input matrix A */
+    int         numColsB = pSrcB->numCols;  /* number of columns of input matrix B */
+    int         numColsA = pSrcA->numCols;  /* number of columns of input matrix A */
+    uint32_t    blkCnt;                     /* loop counters */
+    uint32_t    i;
+    arm_status status; 
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
+  {
+      /* small squared matrix specialized routines */
+    if(numRowsA == numColsB && numColsB == numColsA) {
+        if (numRowsA == 1)
+        {
+           pOut[0] = pInA[0] * pInB[0];
+           return(ARM_MATH_SUCCESS);
+        }
+        else if(numRowsA == 2)
+            return arm_mat_mult_f32_2x2_mve(pSrcA, pSrcB, pDst);
+        else if(numRowsA == 3)
+            return arm_mat_mult_f32_3x3_mve(pSrcA, pSrcB, pDst);
+        else if(numRowsA == 4)
+            return arm_mat_mult_f32_4x4_mve(pSrcA, pSrcB, pDst);
+    }
+
+    /* main loop process 4 rows */
+    i = numRowsA >> 2;
+    while (i > 0U)
+    {
+        float32_t *pInA0, *pInA1, *pInA2, *pInA3;
+        float32_t *pInB0;
+        float32_t *pOut0, *pOut1, *pOut2, *pOut3;
+        f32x4_t vecMac0, vecMac1, vecMac2, vecMac3;
+        f32x4_t vecInB;
+
+        /* pointers to 4 consecutive output rows */
+        pOut0 = pOut;
+        pOut1 = pOut0 + numColsB;
+        pOut2 = pOut1 + numColsB;
+        pOut3 = pOut2 + numColsB;
+        pInB0 = pInB;
+
+        uint32_t  k = numColsB >> 2;
+        while (k > 0U)
+        {
+            /* pointers to 4 consecutive Matrix A rows */
+            pInA0 = pInA;
+            pInA1 = pInA0 + numColsA;
+            pInA2 = pInA1 + numColsA;
+            pInA3 = pInA2 + numColsA;
+
+            vecMac0 = vdupq_n_f32(0.0f);
+            vecMac1 = vdupq_n_f32(0.0f);
+            vecMac2 = vdupq_n_f32(0.0f);
+            vecMac3 = vdupq_n_f32(0.0f);
+
+            blkCnt = numColsA;
+
+            while (blkCnt > 0U)
+            {
+                /*
+                 * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
+                 */
+                vecInB = *(f32x4_t *)pInB0; /* vldrwq_f32(pInB0, 0); */
+
+                vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+                vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+                vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+                vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+                pInB0 = pInB0 + numColsB;
+                /*
+                 * Decrement the blockSize loop counter
+                 */
+                blkCnt--;
+            }
+
+            /* Store the results (4 x 4 block) in the destination buffer */
+            vst1q(pOut0, vecMac0);  
+            pOut0 += 4;
+            vst1q(pOut1, vecMac1);  
+            pOut1 += 4;
+            vst1q(pOut2, vecMac2);  
+            pOut2 += 4;
+            vst1q(pOut3, vecMac3);  
+            pOut3 += 4;
+
+            /*
+             * rewind
+             */
+            pInB0 -= (numColsB * numColsA) - 4;
+            k--;
+        }
+
+        int       colBLeft = numColsB & 3;
+        if (colBLeft)
+        {
+            pInA0 = pInA;
+            pInA1 = pInA0 + numColsA;
+            pInA2 = pInA1 + numColsA;
+            pInA3 = pInA2 + numColsA;
+            mve_pred16_t p0 = vctp32q(colBLeft);
+
+            vecMac0 = vdupq_n_f32(0.0f);
+            vecMac1 = vdupq_n_f32(0.0f);
+            vecMac2 = vdupq_n_f32(0.0f);
+            vecMac3 = vdupq_n_f32(0.0f);
+
+            blkCnt = numColsA;
+
+            while (blkCnt > 0U)
+            {
+                /*
+                 * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
+                 */
+                vecInB = vldrwq_z_f32(pInB0, p0);
+
+                vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+                vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+                vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+                vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+                pInB0 = pInB0 + numColsB;
+                /*
+                 * Decrement the blockSize loop counter
+                 */
+                blkCnt--;
+            }
+
+            /* Store the results (4 x colBLeft block) in the destination buffer */
+            vstrwq_p_f32(pOut0, vecMac0, p0);
+            vstrwq_p_f32(pOut1, vecMac1, p0);
+            vstrwq_p_f32(pOut2, vecMac2, p0);
+            vstrwq_p_f32(pOut3, vecMac3, p0);
+        }
+
+        /* move to next rows */
+        pInA += 4 * numColsA;
+        pOut += 4 * numColsB;
+        i--;
+    }
+
+    /*
+     * non multiple of 4 rows for Matrix A
+     * process single row
+     */
+    if (numRowsA & 3)
+    {
+        i = numRowsA & 3;
+        while (i > 0U)
+        {
+            float32_t   *pInA0;
+            float32_t   *pInB0;
+            float32_t   *pOut0;
+            f32x4_t    vecInB;
+            f32x4_t    vecMac0;
+
+            pOut0 = pOut;
+            pInB0 = pInB;
+
+            uint32_t       k = numColsB >> 2;
+            while (k > 0U)
+            {
+                pInA0 = pInA;
+
+                vecMac0 = vdupq_n_f32(0.0f);
+                blkCnt = numColsA;
+                while (blkCnt > 0U)
+                {
+                    /*
+                     * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
+                     */
+                    vecInB = *(f32x4_t *)pInB0; /* vldrwq_f32(pInB0, 0); */
+
+                    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+
+                    pInB0 = pInB0 + numColsB;
+                    /*
+                     * Decrement the blockSize loop counter
+                     */
+                    blkCnt--;
+                }
+
+                /* Store the results (1 x 4 block) in the destination buffer */
+                vst1q(pOut0, vecMac0);  
+                pOut0 += 4;
+
+                /*
+                 * rewind
+                 */
+                pInB0 -= (numColsB * numColsA) - 4;
+                k--;
+            }
+
+            int       colBLeft = numColsB & 3;
+            if (colBLeft)
+            {
+                pInA0 = pInA;
+                mve_pred16_t p0 = vctp32q(colBLeft);
+
+                vecMac0 = vdupq_n_f32(0.0f);
+                blkCnt = numColsA;
+                while (blkCnt > 0U)
+                {
+                    /*
+                     * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
+                     */
+                    vecInB = vldrwq_z_f32(pInB0, p0);
+
+                    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+
+                    pInB0 = pInB0 + numColsB;
+                    /*
+                     * Decrement the blockSize loop counter
+                     */
+                    blkCnt--;
+                }
+                /* Store the results (1 x colBLeft block) in the destination buffer */
+                vstrwq_p_f32(pOut0, vecMac0, p0);
+            }
+
+            /* move to next row */
+            pInA += 1 * numColsA;
+            pOut += 1 * numColsB;
+            i--;
+        }
+        
+      }
+      status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
+
+#if defined(ARM_MATH_NEON)
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
+arm_status arm_mat_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A */
+  float32_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B */
+  float32_t *pInA = pSrcA->pData;                /* input data matrix pointer A  */
+  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float32_t *px;                                 /* Temporary output data matrix pointer */
+  float32_t sum;                                 /* Accumulator */
+  uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
+
+
+  uint16_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt;      /* loop counters */
+  arm_status status;                             /* status of matrix multiplication */
+
+  float32x4_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
+  float32x4_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
+  float32x2_t accum = vdup_n_f32(0);
+  float32_t *pIn1B = pSrcA->pData;    
+  float32_t *pIn1C = pSrcA->pData;    
+  float32_t *pIn1D = pSrcA->pData;  
+  float32_t *pIn1E = pSrcA->pData; 
+  float32_t *pIn1F = pSrcA->pData; 
+  float32_t *pIn1G = pSrcA->pData; 
+  float32_t *pIn1H = pSrcA->pData;   
+
+  float32_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH;                                 /* Temporary output data matrix pointer */
+  float32_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* Row loop */
+    rowCnt = row >> 3;
+
+    while(rowCnt > 0)
+    {
+      /* Output pointer is set to starting address of the row being processed */
+      px = pOut + GROUPOFROWS*i;
+      pxB = px + numColsB;
+      pxC = px + 2*numColsB;
+      pxD = px + 3*numColsB;
+      pxE = px + 4*numColsB;
+      pxF = px + 5*numColsB;
+      pxG = px + 6*numColsB;
+      pxH = px + 7*numColsB;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* Column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum0 = 0.0f;
+        sum1 = 0.0f;
+        sum2 = 0.0f;
+        sum3 = 0.0f;
+        sum4 = 0.0f;
+        sum5 = 0.0f;
+        sum6 = 0.0f;
+        sum7 = 0.0f;
+
+        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+        pIn1 = pInA;
+        pIn1B = pIn1 + numColsA;
+        pIn1C = pIn1 + 2*numColsA;
+        pIn1D = pIn1 + 3*numColsA;
+        pIn1E = pIn1 + 4*numColsA;
+        pIn1F = pIn1 + 5*numColsA;
+        pIn1G = pIn1 + 6*numColsA;
+        pIn1H = pIn1 + 7*numColsA;
+
+        acc0 = vdupq_n_f32(0.0);
+        acc1 = vdupq_n_f32(0.0);
+        acc2 = vdupq_n_f32(0.0);
+        acc3 = vdupq_n_f32(0.0);
+        acc4 = vdupq_n_f32(0.0);
+        acc5 = vdupq_n_f32(0.0);
+        acc6 = vdupq_n_f32(0.0);
+        acc7 = vdupq_n_f32(0.0);
+
+        /* Compute 4 MACs simultaneously. */
+        colCnt = numColsA >> 2U;
+
+        /* Matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          a0V = vld1q_f32(pIn1);  
+          a1V = vld1q_f32(pIn1B);  
+          a2V = vld1q_f32(pIn1C); 
+          a3V = vld1q_f32(pIn1D); 
+          a4V = vld1q_f32(pIn1E); 
+          a5V = vld1q_f32(pIn1F); 
+          a6V = vld1q_f32(pIn1G); 
+          a7V = vld1q_f32(pIn1H); 
+
+	      pIn1 += 4;
+          pIn1B += 4;
+          pIn1C += 4;
+          pIn1D += 4;
+          pIn1E += 4;
+          pIn1F += 4;
+          pIn1G += 4;
+          pIn1H += 4;
+          
+          temp = vsetq_lane_f32(*pIn2,temp,0);
+          pIn2 += numColsB;
+          temp = vsetq_lane_f32(*pIn2,temp,1);
+          pIn2 += numColsB;
+          temp = vsetq_lane_f32(*pIn2,temp,2);
+          pIn2 += numColsB;
+          temp = vsetq_lane_f32(*pIn2,temp,3);
+          pIn2 += numColsB;
+
+          acc0 = vmlaq_f32(acc0,a0V,temp);
+          acc1 = vmlaq_f32(acc1,a1V,temp);
+          acc2 = vmlaq_f32(acc2,a2V,temp);
+          acc3 = vmlaq_f32(acc3,a3V,temp);
+          acc4 = vmlaq_f32(acc4,a4V,temp);
+          acc5 = vmlaq_f32(acc5,a5V,temp);
+          acc6 = vmlaq_f32(acc6,a6V,temp);
+          acc7 = vmlaq_f32(acc7,a7V,temp);
+
+          /* Decrement the loop count */
+          colCnt--;
+        }
+
+        accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+        sum0 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
+        sum1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
+        sum2 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
+        sum3 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
+        sum4 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5));
+        sum5 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6));
+        sum6 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7));
+        sum7 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+         ** No loop unrolling is used. */
+        colCnt = numColsA & 3;
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          sum0 += *pIn1++ * (*pIn2);
+          sum1 += *pIn1B++ * (*pIn2);
+          sum2 += *pIn1C++ * (*pIn2);
+          sum3 += *pIn1D++ * (*pIn2);
+          sum4 += *pIn1E++ * (*pIn2);
+          sum5 += *pIn1F++ * (*pIn2);
+          sum6 += *pIn1G++ * (*pIn2);
+          sum7 += *pIn1H++ * (*pIn2);
+          pIn2 += numColsB;
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        /* Store the result in the destination buffer */
+        *px++ = sum0;
+        *pxB++ = sum1;
+        *pxC++ = sum2;
+        *pxD++ = sum3;
+        *pxE++ = sum4;
+        *pxF++ = sum5;
+        *pxG++ = sum6;
+        *pxH++ = sum7;
+
+        /* Update the pointer pIn2 to point to the  starting address of the next column */
+        j++;
+        pIn2 = pSrcB->pData + j;
+
+        /* Decrement the column loop counter */
+        col--;
+
+      } while (col > 0U);
+
+      /* Update the pointer pInA to point to the  starting address of the next row */
+      i = i + numColsB;
+      pInA = pInA + GROUPOFROWS*numColsA;
+
+      /* Decrement the row loop counter */
+      rowCnt--;
+    } 
+
+    /*
+
+    i was the index of a group of rows computed by previous loop.
+    Now i is the index of a row since below code is computing row per row
+    and no more group of row per group of rows.
+
+    */
+
+    i = GROUPOFROWS*i;
+    rowCnt = row & 7;
+
+    while(rowCnt > 0)
+    {
+      /* Output pointer is set to starting address of the row being processed */
+      px = pOut + i;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* Column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum = 0.0f;
+
+        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+        pIn1 = pInA;
+
+        acc0 = vdupq_n_f32(0.0);
+
+        /* Compute 4 MACs simultaneously. */
+        colCnt = numColsA >> 2U;
+
+        /* Matrix multiplication   */
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          a0V = vld1q_f32(pIn1);  // load & separate real/imag pSrcA (de-interleave 2)
+          pIn1 += 4;
+          
+          temp = vsetq_lane_f32(*pIn2,temp,0);
+          pIn2 += numColsB;
+          temp = vsetq_lane_f32(*pIn2,temp,1);
+          pIn2 += numColsB;
+          temp = vsetq_lane_f32(*pIn2,temp,2);
+          pIn2 += numColsB;
+          temp = vsetq_lane_f32(*pIn2,temp,3);
+          pIn2 += numColsB;
+
+          acc0 = vmlaq_f32(acc0,a0V,temp);
+
+          /* Decrement the loop count */
+          colCnt--;
+        }
+
+        accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+        sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+         ** No loop unrolling is used. */
+        colCnt = numColsA % 0x4U;
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          sum += *pIn1++ * (*pIn2);
+          pIn2 += numColsB;
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        /* Store the result in the destination buffer */
+        *px++ = sum;
+
+        /* Update the pointer pIn2 to point to the  starting address of the next column */
+        j++;
+        pIn2 = pSrcB->pData + j;
+
+        /* Decrement the column loop counter */
+        col--;
+
+      } while (col > 0U);
+
+
+      /* Update the pointer pInA to point to the  starting address of the next row */
+      i = i + numColsB;
+      pInA = pInA + numColsA;
+
+      /* Decrement the row loop counter */
+      rowCnt--;
+
+    } 
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
+arm_status arm_mat_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn1 = pSrcA->pData;                /* Input data matrix pointer A */
+  float32_t *pIn2 = pSrcB->pData;                /* Input data matrix pointer B */
+  float32_t *pInA = pSrcA->pData;                /* Input data matrix pointer A */
+  float32_t *pInB = pSrcB->pData;                /* Input data matrix pointer B */
+  float32_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  float32_t *px;                                 /* Temporary output data matrix pointer */
+  float32_t sum;                                 /* Accumulator */
+  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
+  uint32_t col, i = 0U, row = numRowsA, colCnt;  /* Loop counters */
+  arm_status status;                             /* Status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop */
+    do
+    {
+      /* Output pointer is set to starting address of row being processed */
+      px = pOut + i;
+
+      /* For every row wise process, column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      /* column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum = 0.0f;
+
+        /* Initialize pointer pIn1 to point to starting address of column being processed */
+        pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Loop unrolling: Compute 4 MACs at a time. */
+        colCnt = numColsA >> 2U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
+
+          /* Perform the multiply-accumulates */
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Loop unrolling: Compute remaining MACs */
+        colCnt = numColsA % 0x4U;
+
+#else
+
+        /* Initialize cntCnt with number of columns */
+        colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        while (colCnt > 0U)
+        {
+          /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
+
+          /* Perform the multiply-accumulates */
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Store result in destination buffer */
+        *px++ = sum;
+
+        /* Decrement column loop counter */
+        col--;
+
+        /* Update pointer pIn2 to point to starting address of next column */
+        pIn2 = pInB + (numColsB - col);
+
+      } while (col > 0U);
+
+      /* Update pointer pInA to point to starting address of next row */
+      i = i + numColsB;
+      pInA = pInA + numColsA;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/ARM_math/arm_mat_mult_f64.c b/ARM_math/arm_mat_mult_f64.c
new file mode 100644
index 0000000..f7b3110
--- /dev/null
+++ b/ARM_math/arm_mat_mult_f64.c
@@ -0,0 +1,185 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_mult_f64.c
+ * Description:  Floating-point matrix multiplication
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return     		The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
+
+
+arm_status arm_mat_mult_f64(
+  const arm_matrix_instance_f64 * pSrcA,
+  const arm_matrix_instance_f64 * pSrcB,
+        arm_matrix_instance_f64 * pDst)
+{
+  float64_t *pIn1 = pSrcA->pData;                /* Input data matrix pointer A */
+  float64_t *pIn2 = pSrcB->pData;                /* Input data matrix pointer B */
+  float64_t *pInA = pSrcA->pData;                /* Input data matrix pointer A */
+  float64_t *pInB = pSrcB->pData;                /* Input data matrix pointer B */
+  float64_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  float64_t *px;                                 /* Temporary output data matrix pointer */
+  float64_t sum;                                 /* Accumulator */
+  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
+  uint64_t col, i = 0U, row = numRowsA, colCnt;  /* Loop counters */
+  arm_status status;                             /* Status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop */
+    do
+    {
+      /* Output pointer is set to starting address of row being processed */
+      px = pOut + i;
+
+      /* For every row wise process, column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      /* column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum = 0.0;
+
+        /* Initialize pointer pIn1 to point to starting address of column being processed */
+        pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Loop unrolling: Compute 4 MACs at a time. */
+        colCnt = numColsA >> 2U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          /* Perform the multiply-accumulates */
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Loop unrolling: Compute remaining MACs */
+        colCnt = numColsA % 0x4U;
+
+#else
+
+        /* Initialize cntCnt with number of columns */
+        colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          /* Perform the multiply-accumulates */
+          sum += *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Store result in destination buffer */
+        *px++ = sum;
+
+        /* Decrement column loop counter */
+        col--;
+
+        /* Update pointer pIn2 to point to starting address of next column */
+        pIn2 = pInB + (numColsB - col);
+
+      } while (col > 0U);
+
+      /* Update pointer pInA to point to starting address of next row */
+      i = i + numColsB;
+      pInA = pInA + numColsA;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/ARM_math/arm_mat_trans_f32.c b/ARM_math/arm_mat_trans_f32.c
new file mode 100644
index 0000000..27868d2
--- /dev/null
+++ b/ARM_math/arm_mat_trans_f32.c
@@ -0,0 +1,341 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_trans_f32.c
+ * Description:  Floating-point matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @defgroup MatrixTrans Matrix Transpose
+
+  Tranposes a matrix.
+
+  Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
+
+  @par Transpose of a 3 x 3 matrix
+  
+  \f[
+  \begin{pmatrix}
+   a_{1,1} & a_{1,2} & a_{1,3} \\
+   a_{2,1} & a_{2,2} & a_{2,3} \\
+   a_{3,1} & a_{3,2} & a_{3,3} \\
+  \end{pmatrix}^T
+   =
+  \begin{pmatrix}
+   a_{1,1} & a_{2,1} & a_{3,1} \\
+   a_{1,2} & a_{2,2} & a_{3,2} \\
+   a_{1,3} & a_{2,3} & a_{3,3} \\
+  \end{pmatrix}
+  \f]
+  
+ */
+
+/**
+  @addtogroup MatrixTrans
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pDst)
+{
+  arm_status status;                             /* status of matrix transpose  */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+  {
+    if (pDst->numRows == pDst->numCols)
+    {
+        if (pDst->numCols == 2)
+            return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+        if (pDst->numCols == 3)
+            return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+        if (pDst->numCols == 4)
+            return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+    }
+
+    arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+
+arm_status arm_mat_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float32_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t nRows = pSrc->numRows;                /* number of rows */
+  uint16_t nColumns = pSrc->numCols;             /* number of columns */
+
+  uint16_t blkCnt, rowCnt, i = 0U, row = nRows;          /* loop counters */
+  arm_status status;                             /* status of matrix transpose  */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns */
+    /* Row loop */
+    rowCnt = row >> 2;
+    while (rowCnt > 0U)
+    {
+      float32x4_t row0V,row1V,row2V,row3V;
+      float32x4x2_t ra0,ra1,rb0,rb1;
+
+      blkCnt = nColumns >> 2;
+
+      /* The pointer px is set to starting address of the column being processed */
+      px = pOut + i;
+
+      /* Compute 4 outputs at a time.
+       ** a second loop below computes the remaining 1 to 3 samples. */
+      while (blkCnt > 0U)        /* Column loop */
+      {
+        row0V = vld1q_f32(pIn);
+        row1V = vld1q_f32(pIn + 1 * nColumns);
+        row2V = vld1q_f32(pIn + 2 * nColumns);
+        row3V = vld1q_f32(pIn + 3 * nColumns);
+        pIn += 4;
+
+        ra0 = vzipq_f32(row0V,row2V);
+        ra1 = vzipq_f32(row1V,row3V);
+
+        rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
+        rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
+
+        vst1q_f32(px,rb0.val[0]);
+        px += nRows;
+
+        vst1q_f32(px,rb0.val[1]);
+        px += nRows;
+
+        vst1q_f32(px,rb1.val[0]);
+        px += nRows;
+
+        vst1q_f32(px,rb1.val[1]);
+        px += nRows;
+
+        /* Decrement the column loop counter */
+        blkCnt--;
+      }
+
+      /* Perform matrix transpose for last 3 samples here. */
+      blkCnt = nColumns % 0x4U;
+
+      while (blkCnt > 0U)
+      {
+        /* Read and store the input element in the destination */
+        *px++ = *pIn;
+        *px++ = *(pIn + 1 * nColumns);
+        *px++ = *(pIn + 2 * nColumns);
+        *px++ = *(pIn + 3 * nColumns);
+        
+        px += (nRows - 4);
+        pIn++;
+
+        /* Decrement the column loop counter */
+        blkCnt--;
+      }
+
+      i += 4;
+      pIn += 3 * nColumns;
+
+      /* Decrement the row loop counter */
+      rowCnt--;
+
+    }         /* Row loop end  */
+
+    rowCnt = row & 3;
+    while (rowCnt > 0U)
+    {
+      blkCnt = nColumns ;
+      /* The pointer px is set to starting address of the column being processed */
+      px = pOut + i;
+
+      while (blkCnt > 0U)
+      {
+        /* Read and store the input element in the destination */
+        *px = *pIn++;
+
+        /* Update the pointer px to point to the next row of the transposed matrix */
+        px += nRows;
+
+        /* Decrement the column loop counter */
+        blkCnt--;
+      }
+      i++;
+      rowCnt -- ;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
+arm_status arm_mat_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float32_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t nRows = pSrc->numRows;                /* number of rows */
+  uint16_t nCols = pSrc->numCols;                /* number of columns */
+  uint32_t col, row = nRows, i = 0U;             /* Loop counters */
+  arm_status status;                             /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numCols) ||
+      (pSrc->numCols != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns */
+    /* row loop */
+    do
+    {
+      /* Pointer px is set to starting address of column being processed */
+      px = pOut + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+      /* Loop unrolling: Compute 4 outputs at a time */
+      col = nCols >> 2U;
+
+      while (col > 0U)        /* column loop */
+      {
+        /* Read and store input element in destination */
+        *px = *pIn++;
+        /* Update pointer px to point to next row of transposed matrix */
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        /* Decrement column loop counter */
+        col--;
+      }
+
+      /* Loop unrolling: Compute remaining outputs */
+      col = nCols % 0x4U;
+
+#else
+
+      /* Initialize col with number of samples */
+      col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+      while (col > 0U)
+      {
+        /* Read and store input element in destination */
+        *px = *pIn++;
+
+        /* Update pointer px to point to next row of transposed matrix */
+        px += nRows;
+
+        /* Decrement column loop counter */
+        col--;
+      }
+
+      i++;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);          /* row loop end */
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixTrans group
+ */
diff --git a/ARM_math/arm_mat_trans_f64.c b/ARM_math/arm_mat_trans_f64.c
new file mode 100644
index 0000000..a507347
--- /dev/null
+++ b/ARM_math/arm_mat_trans_f64.c
@@ -0,0 +1,148 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_trans_f64.c
+ * Description:  Floating-point matrix transpose
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+
+
+/**
+  @addtogroup MatrixTrans
+  @{
+ */
+
+/**
+  @brief         Floating-point matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+arm_status arm_mat_trans_f64(
+  const arm_matrix_instance_f64 * pSrc,
+        arm_matrix_instance_f64 * pDst)
+{
+  float64_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float64_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float64_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t nRows = pSrc->numRows;                /* number of rows */
+  uint16_t nCols = pSrc->numCols;                /* number of columns */
+  uint64_t col, row = nRows, i = 0U;             /* Loop counters */
+  arm_status status;                             /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numCols) ||
+      (pSrc->numCols != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns */
+    /* row loop */
+    do
+    {
+      /* Pointer px is set to starting address of column being processed */
+      px = pOut + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+      /* Loop unrolling: Compute 4 outputs at a time */
+      col = nCols >> 2U;
+
+      while (col > 0U)        /* column loop */
+      {
+        /* Read and store input element in destination */
+        *px = *pIn++;
+        /* Update pointer px to point to next row of transposed matrix */
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        *px = *pIn++;
+        px += nRows;
+
+        /* Decrement column loop counter */
+        col--;
+      }
+
+      /* Loop unrolling: Compute remaining outputs */
+      col = nCols % 0x4U;
+
+#else
+
+      /* Initialize col with number of samples */
+      col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+      while (col > 0U)
+      {
+        /* Read and store input element in destination */
+        *px = *pIn++;
+
+        /* Update pointer px to point to next row of transposed matrix */
+        px += nRows;
+
+        /* Decrement column loop counter */
+        col--;
+      }
+
+      i++;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);          /* row loop end */
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+/**
+ * @} end of MatrixTrans group
+ */
diff --git a/ARM_math/arm_math_memory.h b/ARM_math/arm_math_memory.h
new file mode 100644
index 0000000..7bd83dc
--- /dev/null
+++ b/ARM_math/arm_math_memory.h
@@ -0,0 +1,206 @@
+/******************************************************************************
+ * @file     arm_math_memory.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.10.0
+ * @date     08 July 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_MEMORY_H_
+
+#define _ARM_MATH_MEMORY_H_
+
+#include "arm_math_types.h"
+
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+  @brief definition to read/write two 16 bit values.
+  @deprecated
+ */
+#if   defined ( __CC_ARM )
+  #define __SIMD32_TYPE int32_t __packed
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __GNUC__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __ICCARM__ )
+  #define __SIMD32_TYPE int32_t __packed
+#elif defined ( __TI_ARM__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __CSMC__ )
+  #define __SIMD32_TYPE int32_t
+#elif defined ( __TASKING__ )
+  #define __SIMD32_TYPE __un(aligned) int32_t
+#elif defined(_MSC_VER )
+  #define __SIMD32_TYPE int32_t
+#else
+  #error Unknown compiler
+#endif
+
+#define __SIMD32(addr)        (*(__SIMD32_TYPE **) & (addr))
+#define __SIMD32_CONST(addr)  ( (__SIMD32_TYPE * )   (addr))
+#define _SIMD32_OFFSET(addr)  (*(__SIMD32_TYPE * )   (addr))
+#define __SIMD64(addr)        (*(      int64_t **) & (addr))
+
+
+/* SIMD replacement */
+
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer.
+  @param[in]     pQ15      points to input value
+  @return        Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q15x2 (
+  q15_t const * pQ15)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, pQ15, 4);
+#else
+  val = (pQ15[1] << 16) | (pQ15[0] & 0x0FFFF) ;
+#endif
+
+  return (val);
+}
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer and increment pointer afterwards.
+  @param[in]     pQ15      points to input value
+  @return        Q31 value
+ */
+#define read_q15x2_ia(pQ15) read_q15x2((*(pQ15) += 2) - 2)
+
+/**
+  @brief         Read 2 Q15 from Q15 pointer and decrement pointer afterwards.
+  @param[in]     pQ15      points to input value
+  @return        Q31 value
+ */
+#define read_q15x2_da(pQ15) read_q15x2((*(pQ15) -= 2) + 2)
+
+/**
+  @brief         Write 2 Q15 to Q15 pointer and increment pointer afterwards.
+  @param[in]     pQ15      points to input value
+  @param[in]     value     Q31 value
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x2_ia (
+  q15_t ** pQ15,
+  q31_t    value)
+{
+  q31_t val = value;
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (*pQ15, &val, 4);
+#else
+  (*pQ15)[0] = (q15_t)(val & 0x0FFFF);
+  (*pQ15)[1] = (q15_t)((val >> 16) & 0x0FFFF);
+#endif
+
+ *pQ15 += 2;
+}
+
+/**
+  @brief         Write 2 Q15 to Q15 pointer.
+  @param[in]     pQ15      points to input value
+  @param[in]     value     Q31 value
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q15x2 (
+  q15_t * pQ15,
+  q31_t   value)
+{
+  q31_t val = value;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (pQ15, &val, 4);
+#else
+  pQ15[0] = (q15_t)(val & 0x0FFFF);
+  pQ15[1] = (q15_t)(val >> 16);
+#endif
+}
+
+
+/**
+  @brief         Read 4 Q7 from Q7 pointer
+  @param[in]     pQ7       points to input value
+  @return        Q31 value
+ */
+__STATIC_FORCEINLINE q31_t read_q7x4 (
+  q7_t const * pQ7)
+{
+  q31_t val;
+
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (&val, pQ7, 4);
+#else
+  val =((pQ7[3] & 0x0FF) << 24)  | ((pQ7[2] & 0x0FF) << 16)  | ((pQ7[1] & 0x0FF) << 8)  | (pQ7[0] & 0x0FF);
+#endif 
+  return (val);
+}
+
+/**
+  @brief         Read 4 Q7 from Q7 pointer and increment pointer afterwards.
+  @param[in]     pQ7       points to input value
+  @return        Q31 value
+ */
+#define read_q7x4_ia(pQ7) read_q7x4((*(pQ7) += 4) - 4)
+
+/**
+  @brief         Read 4 Q7 from Q7 pointer and decrement pointer afterwards.
+  @param[in]     pQ7       points to input value
+  @return        Q31 value
+ */
+#define read_q7x4_da(pQ7) read_q7x4((*(pQ7) -= 4) + 4)
+
+/**
+  @brief         Write 4 Q7 to Q7 pointer and increment pointer afterwards.
+  @param[in]     pQ7       points to input value
+  @param[in]     value     Q31 value
+  @return        none
+ */
+__STATIC_FORCEINLINE void write_q7x4_ia (
+  q7_t ** pQ7,
+  q31_t   value)
+{
+  q31_t val = value;
+#ifdef __ARM_FEATURE_UNALIGNED
+  memcpy (*pQ7, &val, 4);
+#else
+  (*pQ7)[0] = (q7_t)(val & 0x0FF);
+  (*pQ7)[1] = (q7_t)((val >> 8) & 0x0FF);
+  (*pQ7)[2] = (q7_t)((val >> 16) & 0x0FF);
+  (*pQ7)[3] = (q7_t)((val >> 24) & 0x0FF);
+
+#endif
+  *pQ7 += 4;
+}
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /*ifndef _ARM_MATH_MEMORY_H_ */
diff --git a/ARM_math/arm_math_types.h b/ARM_math/arm_math_types.h
new file mode 100644
index 0000000..c615e66
--- /dev/null
+++ b/ARM_math/arm_math_types.h
@@ -0,0 +1,616 @@
+/******************************************************************************
+ * @file     arm_math_types.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.10.0
+ * @date     08 July 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_TYPES_H_
+
+#define _ARM_MATH_TYPES_H_
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/* Compiler specific diagnostic adjustment */
+#if   defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __APPLE_CC__ )
+  #pragma GCC diagnostic ignored "-Wold-style-cast"
+
+#elif defined ( __GNUC__ )
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wsign-conversion"
+  #pragma GCC diagnostic ignored "-Wconversion"
+  #pragma GCC diagnostic ignored "-Wunused-parameter"
+
+#elif defined ( __ICCARM__ )
+
+#elif defined ( __TI_ARM__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+
+/* Included for instrinsics definitions */
+#if defined (_MSC_VER ) 
+#include <stdint.h>
+#define __STATIC_FORCEINLINE static __forceinline
+#define __STATIC_INLINE static __inline
+#define __ALIGNED(x) __declspec(align(x))
+#elif defined ( __APPLE_CC__ )
+#include <stdint.h>
+#define  __ALIGNED(x) __attribute__((aligned(x)))
+#define __STATIC_FORCEINLINE static inline __attribute__((always_inline)) 
+#define __STATIC_INLINE static inline
+#elif defined (__GNUC_PYTHON__)
+#include <stdint.h>
+#define  __ALIGNED(x) __attribute__((aligned(x)))
+#define __STATIC_FORCEINLINE static inline __attribute__((always_inline)) 
+#define __STATIC_INLINE static inline
+
+#else
+#include "cmsis_compiler.h"
+#endif
+
+
+
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+
+/* evaluate ARM DSP feature */
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+  #define ARM_MATH_DSP                   1
+#endif
+
+#if defined(ARM_MATH_NEON)
+  #if defined(_MSC_VER) && defined(_M_ARM64EC)
+    #include <arm64_neon.h>
+  #else
+    #include <arm_neon.h>
+  #endif
+  #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    #if !defined(ARM_MATH_NEON_FLOAT16)
+      #define ARM_MATH_NEON_FLOAT16
+    #endif
+  #endif
+#endif
+
+#if !defined(ARM_MATH_AUTOVECTORIZE)
+
+
+#if defined(__ARM_FEATURE_MVE)
+#if __ARM_FEATURE_MVE
+  #if !defined(ARM_MATH_MVEI)
+    #define ARM_MATH_MVEI
+  #endif
+#endif
+
+#if (__ARM_FEATURE_MVE & 2)
+  #if !defined(ARM_MATH_MVEF)
+    #define ARM_MATH_MVEF
+  #endif
+  #if !defined(ARM_MATH_MVE_FLOAT16)
+       #define ARM_MATH_MVE_FLOAT16
+  #endif
+#endif
+
+#endif /*defined(__ARM_FEATURE_MVE)*/
+#endif /*!defined(ARM_MATH_AUTOVECTORIZE)*/
+
+
+#if defined (ARM_MATH_HELIUM)
+  #if !defined(ARM_MATH_MVEF)
+    #define ARM_MATH_MVEF
+  #endif
+
+  #if !defined(ARM_MATH_MVEI)
+    #define ARM_MATH_MVEI
+  #endif
+
+  #if !defined(ARM_MATH_MVE_FLOAT16)
+       #define ARM_MATH_MVE_FLOAT16
+  #endif
+#endif
+
+
+
+#if   defined ( __CC_ARM )
+  /* Enter low optimization region - place directly above function definition */
+  #if defined( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_ENTER \
+       _Pragma ("push")         \
+       _Pragma ("O1")
+  #else
+    #define LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_EXIT \
+       _Pragma ("pop")
+  #else
+    #define LOW_OPTIMIZATION_EXIT
+  #endif
+
+  /* Enter low optimization region - place directly above function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined (__ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+  
+#elif defined ( __APPLE_CC__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __GNUC__ )
+  #define LOW_OPTIMIZATION_ENTER \
+       __attribute__(( optimize("-O1") ))
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __ICCARM__ )
+  /* Enter low optimization region - place directly above function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define LOW_OPTIMIZATION_ENTER \
+       _Pragma ("optimize=low")
+  #else
+    #define LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define LOW_OPTIMIZATION_EXIT
+
+  /* Enter low optimization region - place directly above function definition */
+  #if defined ( __ARM_ARCH_7EM__ )
+    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER \
+       _Pragma ("optimize=low")
+  #else
+    #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #endif
+
+  /* Exit low optimization region - place directly after end of function definition */
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __TI_ARM__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __CSMC__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+
+#elif defined ( __TASKING__ )
+  #define LOW_OPTIMIZATION_ENTER
+  #define LOW_OPTIMIZATION_EXIT
+  #define IAR_ONLY_LOW_OPTIMIZATION_ENTER
+  #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+       
+#elif defined ( _MSC_VER ) || defined(__GNUC_PYTHON__)
+      #define LOW_OPTIMIZATION_ENTER
+      #define LOW_OPTIMIZATION_EXIT
+      #define IAR_ONLY_LOW_OPTIMIZATION_ENTER 
+      #define IAR_ONLY_LOW_OPTIMIZATION_EXIT
+#endif
+
+
+
+/* Compiler specific diagnostic adjustment */
+#if   defined ( __CC_ARM )
+
+#elif defined ( __ARMCC_VERSION ) && ( __ARMCC_VERSION >= 6010050 )
+
+#elif defined ( __APPLE_CC__ )
+
+#elif defined ( __GNUC__ )
+#pragma GCC diagnostic pop
+
+#elif defined ( __ICCARM__ )
+
+#elif defined ( __TI_ARM__ )
+
+#elif defined ( __CSMC__ )
+
+#elif defined ( __TASKING__ )
+
+#elif defined ( _MSC_VER )
+
+#else
+  #error Unknown compiler
+#endif
+
+#ifdef   __cplusplus
+}
+#endif
+
+#if defined(__ARM_FEATURE_MVE) && __ARM_FEATURE_MVE
+#include <arm_mve.h>
+#endif
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+ /**
+   * @brief 8-bit fractional data type in 1.7 format.
+   */
+  typedef int8_t q7_t;
+
+  /**
+   * @brief 16-bit fractional data type in 1.15 format.
+   */
+  typedef int16_t q15_t;
+
+  /**
+   * @brief 32-bit fractional data type in 1.31 format.
+   */
+  typedef int32_t q31_t;
+
+  /**
+   * @brief 64-bit fractional data type in 1.63 format.
+   */
+  typedef int64_t q63_t;
+
+  /**
+   * @brief 32-bit floating-point type definition.
+   */
+#if !defined(__ICCARM__) || !(__ARM_FEATURE_MVE & 2)
+  typedef float float32_t;
+#endif
+
+  /**
+   * @brief 64-bit floating-point type definition.
+   */
+  typedef double float64_t;
+
+  /**
+   * @brief vector types
+   */
+#if defined(ARM_MATH_NEON) || (defined (ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE))
+  /**
+   * @brief 64-bit fractional 128-bit vector data type in 1.63 format
+   */
+  typedef int64x2_t q63x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 1.31 format.
+   */
+  typedef int32x4_t q31x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector data type with 16-bit alignment in 1.15 format.
+   */
+  typedef __ALIGNED(2) int16x8_t q15x8_t;
+
+ /**
+   * @brief 8-bit fractional 128-bit vector data type with 8-bit alignment in 1.7 format.
+   */
+  typedef __ALIGNED(1) int8x16_t q7x16_t;
+
+    /**
+   * @brief 32-bit fractional 128-bit vector pair data type in 1.31 format.
+   */
+  typedef int32x4x2_t q31x4x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector quadruplet data type in 1.31 format.
+   */
+  typedef int32x4x4_t q31x4x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector pair data type in 1.15 format.
+   */
+  typedef int16x8x2_t q15x8x2_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector quadruplet data type in 1.15 format.
+   */
+  typedef int16x8x4_t q15x8x4_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector pair data type in 1.7 format.
+   */
+  typedef int8x16x2_t q7x16x2_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector quadruplet data type in 1.7 format.
+   */
+   typedef int8x16x4_t q7x16x4_t;
+
+  /**
+   * @brief 32-bit fractional data type in 9.23 format.
+   */
+  typedef int32_t q23_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 9.23 format.
+   */
+  typedef int32x4_t q23x4_t;
+
+  /**
+   * @brief 64-bit status 128-bit vector data type.
+   */
+  typedef int64x2_t status64x2_t;
+
+  /**
+   * @brief 32-bit status 128-bit vector data type.
+   */
+  typedef int32x4_t status32x4_t;
+
+  /**
+   * @brief 16-bit status 128-bit vector data type.
+   */
+  typedef int16x8_t status16x8_t;
+
+  /**
+   * @brief 8-bit status 128-bit vector data type.
+   */
+  typedef int8x16_t status8x16_t;
+
+
+#endif
+
+#if defined(ARM_MATH_NEON) || (defined(ARM_MATH_MVEF)  && !defined(ARM_MATH_AUTOVECTORIZE)) /* floating point vector*/
+  /**
+   * @brief 32-bit floating-point 128-bit vector type
+   */
+  typedef float32x4_t f32x4_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector pair data type
+   */
+  typedef float32x4x2_t f32x4x2_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector quadruplet data type
+   */
+  typedef float32x4x4_t f32x4x4_t;
+
+  /**
+   * @brief 32-bit ubiquitous 128-bit vector data type
+   */
+  typedef union _any32x4_t
+  {
+      float32x4_t     f;
+      int32x4_t       i;
+  } any32x4_t;
+
+#endif
+
+#if defined(ARM_MATH_NEON)
+  /**
+   * @brief 32-bit fractional 64-bit vector data type in 1.31 format.
+   */
+  typedef int32x2_t  q31x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector data type in 1.15 format.
+   */
+  typedef  __ALIGNED(2) int16x4_t q15x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector data type in 1.7 format.
+   */
+  typedef  __ALIGNED(1) int8x8_t q7x8_t;
+
+  /**
+   * @brief 32-bit float 64-bit vector data type.
+   */
+  typedef float32x2_t  f32x2_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector triplet data type
+   */
+  typedef float32x4x3_t f32x4x3_t;
+
+
+  /**
+   * @brief 32-bit fractional 128-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x4x3_t q31x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x8x3_t q15x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x16x3_t q7x16x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector pair data type
+   */
+  typedef float32x2x2_t f32x2x2_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector triplet data type
+   */
+  typedef float32x2x3_t f32x2x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector quadruplet data type
+   */
+  typedef float32x2x4_t f32x2x4_t;
+
+
+  /**
+   * @brief 32-bit fractional 64-bit vector pair data type in 1.31 format
+   */
+  typedef int32x2x2_t q31x2x2_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x2x3_t q31x2x3_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector quadruplet data type in 1.31 format
+   */
+  typedef int32x4x3_t q31x2x4_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector pair data type in 1.15 format
+   */
+  typedef int16x4x2_t q15x4x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x4x2_t q15x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector quadruplet data type in 1.15 format
+   */
+  typedef int16x4x3_t q15x4x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector pair data type in 1.7 format
+   */
+  typedef int8x8x2_t q7x8x2_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x8x3_t q7x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector quadruplet data type in 1.7 format
+   */
+  typedef int8x8x4_t q7x8x4_t;
+
+  /**
+   * @brief 32-bit ubiquitous 64-bit vector data type
+   */
+  typedef union _any32x2_t
+  {
+      float32x2_t     f;
+      int32x2_t       i;
+  } any32x2_t;
+
+
+  /**
+   * @brief 32-bit status 64-bit vector data type.
+   */
+  typedef int32x4_t status32x2_t;
+
+  /**
+   * @brief 16-bit status 64-bit vector data type.
+   */
+  typedef int16x8_t status16x4_t;
+
+  /**
+   * @brief 8-bit status 64-bit vector data type.
+   */
+  typedef int8x16_t status8x8_t;
+
+#endif
+
+
+
+
+
+#define F64_MAX   ((float64_t)DBL_MAX)
+#define F32_MAX   ((float32_t)FLT_MAX)
+
+
+
+#define F64_MIN   (-DBL_MAX)
+#define F32_MIN   (-FLT_MAX)
+
+
+
+#define F64_ABSMAX   ((float64_t)DBL_MAX)
+#define F32_ABSMAX   ((float32_t)FLT_MAX)
+
+
+
+#define F64_ABSMIN   ((float64_t)0.0)
+#define F32_ABSMIN   ((float32_t)0.0)
+
+
+#define Q31_MAX   ((q31_t)(0x7FFFFFFFL))
+#define Q15_MAX   ((q15_t)(0x7FFF))
+#define Q7_MAX    ((q7_t)(0x7F))
+#define Q31_MIN   ((q31_t)(0x80000000L))
+#define Q15_MIN   ((q15_t)(0x8000))
+#define Q7_MIN    ((q7_t)(0x80))
+
+#define Q31_ABSMAX   ((q31_t)(0x7FFFFFFFL))
+#define Q15_ABSMAX   ((q15_t)(0x7FFF))
+#define Q7_ABSMAX    ((q7_t)(0x7F))
+#define Q31_ABSMIN   ((q31_t)0)
+#define Q15_ABSMIN   ((q15_t)0)
+#define Q7_ABSMIN    ((q7_t)0)
+
+  /* Dimension C vector space */
+  #define CMPLX_DIM 2
+
+  /**
+   * @brief Error status returned by some functions in the library.
+   */
+
+  typedef enum
+  {
+    ARM_MATH_SUCCESS                 =  0,        /**< No error */
+    ARM_MATH_ARGUMENT_ERROR          = -1,        /**< One or more arguments are incorrect */
+    ARM_MATH_LENGTH_ERROR            = -2,        /**< Length of data buffer is incorrect */
+    ARM_MATH_SIZE_MISMATCH           = -3,        /**< Size of matrices is not compatible with the operation */
+    ARM_MATH_NANINF                  = -4,        /**< Not-a-number (NaN) or infinity is generated */
+    ARM_MATH_SINGULAR                = -5,        /**< Input matrix is singular and cannot be inverted */
+    ARM_MATH_TEST_FAILURE            = -6,        /**< Test Failed */
+    ARM_MATH_DECOMPOSITION_FAILURE   = -7         /**< Decomposition Failed */
+  } arm_status;
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /*ifndef _ARM_MATH_TYPES_H_ */
diff --git a/ARM_math/cmsis_compiler.h b/ARM_math/cmsis_compiler.h
new file mode 100644
index 0000000..adbf296
--- /dev/null
+++ b/ARM_math/cmsis_compiler.h
@@ -0,0 +1,283 @@
+/**************************************************************************//**
+ * @file     cmsis_compiler.h
+ * @brief    CMSIS compiler generic header file
+ * @version  V5.1.0
+ * @date     09. October 2018
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_COMPILER_H
+#define __CMSIS_COMPILER_H
+
+#include <stdint.h>
+
+/*
+ * Arm Compiler 4/5
+ */
+#if   defined ( __CC_ARM )
+  #include "cmsis_armcc.h"
+
+
+/*
+ * Arm Compiler 6.6 LTM (armclang)
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) && (__ARMCC_VERSION < 6100100)
+  #include "cmsis_armclang_ltm.h"
+
+  /*
+ * Arm Compiler above 6.10.1 (armclang)
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100)
+  #include "cmsis_armclang.h"
+
+
+/*
+ * GNU Compiler
+ */
+#elif defined ( __GNUC__ )
+  #include "cmsis_gcc.h"
+
+
+/*
+ * IAR Compiler
+ */
+#elif defined ( __ICCARM__ )
+  #include <cmsis_iccarm.h>
+
+
+/*
+ * TI Arm Compiler
+ */
+#elif defined ( __TI_ARM__ )
+  #include <cmsis_ccs.h>
+
+  #ifndef   __ASM
+    #define __ASM                                  __asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    #define __NO_RETURN                            __attribute__((noreturn))
+  #endif
+  #ifndef   __USED
+    #define __USED                                 __attribute__((used))
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __attribute__((weak))
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               __attribute__((packed))
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        struct __attribute__((packed))
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         union __attribute__((packed))
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void*)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #define __ALIGNED(x)                           __attribute__((aligned(x)))
+  #endif
+  #ifndef   __RESTRICT
+    #define __RESTRICT                             __restrict
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+/*
+ * TASKING Compiler
+ */
+#elif defined ( __TASKING__ )
+  /*
+   * The CMSIS functions have been implemented as intrinsics in the compiler.
+   * Please use "carm -?i" to get an up to date list of all intrinsics,
+   * Including the CMSIS ones.
+   */
+
+  #ifndef   __ASM
+    #define __ASM                                  __asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    #define __NO_RETURN                            __attribute__((noreturn))
+  #endif
+  #ifndef   __USED
+    #define __USED                                 __attribute__((used))
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __attribute__((weak))
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               __packed__
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        struct __packed__
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         union __packed__
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    struct __packed__ T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #define __ALIGNED(x)              __align(x)
+  #endif
+  #ifndef   __RESTRICT
+    #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+    #define __RESTRICT
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+/*
+ * COSMIC Compiler
+ */
+#elif defined ( __CSMC__ )
+   #include <cmsis_csm.h>
+
+ #ifndef   __ASM
+    #define __ASM                                  _asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    // NO RETURN is automatically detected hence no warning here
+    #define __NO_RETURN
+  #endif
+  #ifndef   __USED
+    #warning No compiler specific solution for __USED. __USED is ignored.
+    #define __USED
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __weak
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               @packed
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        @packed struct
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         @packed union
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    @packed struct T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored.
+    #define __ALIGNED(x)
+  #endif
+  #ifndef   __RESTRICT
+    #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+    #define __RESTRICT
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+#else
+  #error Unknown compiler.
+#endif
+
+
+#endif /* __CMSIS_COMPILER_H */
+
diff --git a/ARM_math/cmsis_gcc.h b/ARM_math/cmsis_gcc.h
new file mode 100644
index 0000000..3ddcc58
--- /dev/null
+++ b/ARM_math/cmsis_gcc.h
@@ -0,0 +1,2168 @@
+/**************************************************************************//**
+ * @file     cmsis_gcc.h
+ * @brief    CMSIS compiler GCC header file
+ * @version  V5.2.0
+ * @date     08. May 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_GCC_H
+#define __CMSIS_GCC_H
+
+/* ignore some GCC warnings */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-conversion"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+
+/* Fallback for __has_builtin */
+#ifndef __has_builtin
+  #define __has_builtin(x) (0)
+#endif
+
+/* CMSIS compiler specific defines */
+#ifndef   __ASM
+  #define __ASM                                  __asm
+#endif
+#ifndef   __INLINE
+  #define __INLINE                               inline
+#endif
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE                        static inline
+#endif
+#ifndef   __STATIC_FORCEINLINE                 
+  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static inline
+#endif                                           
+#ifndef   __NO_RETURN
+  #define __NO_RETURN                            __attribute__((__noreturn__))
+#endif
+#ifndef   __USED
+  #define __USED                                 __attribute__((used))
+#endif
+#ifndef   __WEAK
+  #define __WEAK                                 __attribute__((weak))
+#endif
+#ifndef   __PACKED
+  #define __PACKED                               __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_STRUCT
+  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_UNION
+  #define __PACKED_UNION                         union __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __UNALIGNED_UINT32        /* deprecated */
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+#endif
+#ifndef   __UNALIGNED_UINT16_WRITE
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT16_READ
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __UNALIGNED_UINT32_WRITE
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT32_READ
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __ALIGNED
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+#ifndef   __RESTRICT
+  #define __RESTRICT                             __restrict
+#endif
+#ifndef   __COMPILER_BARRIER
+  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+
+#ifndef __PROGRAM_START
+
+/**
+  \brief   Initializes data and bss sections
+  \details This default implementations initialized all data and additional bss
+           sections relying on .copy.table and .zero.table specified properly
+           in the used linker script.
+  
+ */
+__STATIC_FORCEINLINE __NO_RETURN void __cmsis_start(void)
+{
+  extern void _start(void) __NO_RETURN;
+  
+  typedef struct {
+    uint32_t const* src;
+    uint32_t* dest;
+    uint32_t  wlen;
+  } __copy_table_t;
+  
+  typedef struct {
+    uint32_t* dest;
+    uint32_t  wlen;
+  } __zero_table_t;
+  
+  extern const __copy_table_t __copy_table_start__;
+  extern const __copy_table_t __copy_table_end__;
+  extern const __zero_table_t __zero_table_start__;
+  extern const __zero_table_t __zero_table_end__;
+
+  for (__copy_table_t const* pTable = &__copy_table_start__; pTable < &__copy_table_end__; ++pTable) {
+    for(uint32_t i=0u; i<pTable->wlen; ++i) {
+      pTable->dest[i] = pTable->src[i];
+    }
+  }
+ 
+  for (__zero_table_t const* pTable = &__zero_table_start__; pTable < &__zero_table_end__; ++pTable) {
+    for(uint32_t i=0u; i<pTable->wlen; ++i) {
+      pTable->dest[i] = 0u;
+    }
+  }
+ 
+  _start();
+}
+  
+#define __PROGRAM_START           __cmsis_start
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP              __StackTop
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT             __StackLimit
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE            __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE  __attribute((used, section(".vectors")))
+#endif
+
+/* ###########################  Core Function Access  ########################### */
+/** \ingroup  CMSIS_Core_FunctionInterface
+    \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+  @{
+ */
+
+/**
+  \brief   Enable IRQ Interrupts
+  \details Enables IRQ interrupts by clearing the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_irq(void)
+{
+  __ASM volatile ("cpsie i" : : : "memory");
+}
+
+
+/**
+  \brief   Disable IRQ Interrupts
+  \details Disables IRQ interrupts by setting the I-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_irq(void)
+{
+  __ASM volatile ("cpsid i" : : : "memory");
+}
+
+
+/**
+  \brief   Get Control Register
+  \details Returns the content of the Control Register.
+  \return               Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_CONTROL(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Control Register (non-secure)
+  \details Returns the content of the non-secure Control Register when in secure mode.
+  \return               non-secure Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Control Register
+  \details Writes the given value to the Control Register.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control)
+{
+  __ASM volatile ("MSR control, %0" : : "r" (control) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Control Register (non-secure)
+  \details Writes the given value to the non-secure Control Register when in secure state.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control)
+{
+  __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Get IPSR Register
+  \details Returns the content of the IPSR Register.
+  \return               IPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_IPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, ipsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get APSR Register
+  \details Returns the content of the APSR Register.
+  \return               APSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_APSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, apsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get xPSR Register
+  \details Returns the content of the xPSR Register.
+  \return               xPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_xPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, xpsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get Process Stack Pointer
+  \details Returns the current value of the Process Stack Pointer (PSP).
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp"  : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Process Stack Pointer (PSP) when in secure state.
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp_ns"  : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer
+  \details Assigns the given value to the Process Stack Pointer (PSP).
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Process Stack Pointer (PSP) when in secure state.
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer
+  \details Returns the current value of the Main Stack Pointer (MSP).
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Main Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Main Stack Pointer (MSP) when in secure state.
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer
+  \details Assigns the given value to the Main Stack Pointer (MSP).
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Main Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state.
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : );
+}
+#endif
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state.
+  \return               SP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, sp_ns" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Set Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state.
+  \param [in]    topOfStack  Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack)
+{
+  __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Priority Mask
+  \details Returns the current state of the priority mask bit from the Priority Mask Register.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask" : "=r" (result) :: "memory");
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Priority Mask (non-secure)
+  \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask_ns" : "=r" (result) :: "memory");
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Priority Mask
+  \details Assigns the given value to the Priority Mask Register.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Priority Mask (non-secure)
+  \details Assigns the given value to the non-secure Priority Mask Register when in secure state.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory");
+}
+#endif
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+/**
+  \brief   Enable FIQ
+  \details Enables FIQ interrupts by clearing the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_fault_irq(void)
+{
+  __ASM volatile ("cpsie f" : : : "memory");
+}
+
+
+/**
+  \brief   Disable FIQ
+  \details Disables FIQ interrupts by setting the F-bit in the CPSR.
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_fault_irq(void)
+{
+  __ASM volatile ("cpsid f" : : : "memory");
+}
+
+
+/**
+  \brief   Get Base Priority
+  \details Returns the current value of the Base Priority register.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Base Priority (non-secure)
+  \details Returns the current value of the non-secure Base Priority register when in secure state.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority
+  \details Assigns the given value to the Base Priority register.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Base Priority (non-secure)
+  \details Assigns the given value to the non-secure Base Priority register when in secure state.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority with condition
+  \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+           or the new value increases the BASEPRI priority level.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory");
+}
+
+
+/**
+  \brief   Get Fault Mask
+  \details Returns the current value of the Fault Mask register.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Fault Mask (non-secure)
+  \details Returns the current value of the non-secure Fault Mask register when in secure state.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Fault Mask
+  \details Assigns the given value to the Fault Mask register.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Fault Mask (non-secure)
+  \details Assigns the given value to the non-secure Fault Mask register when in secure state.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory");
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+
+/**
+  \brief   Get Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+  
+  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+    // without main extensions, the non-secure PSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim"  : "=r" (result) );
+  return result;
+#endif
+}
+
+#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim_ns"  : "=r" (result) );
+  return result;
+#endif
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+  
+  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure PSPLIM is RAZ/WI
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit));
+#endif
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+
+  \details Returns the current value of the Main Stack Pointer Limit (MSPLIM).
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim" : "=r" (result) );
+  return result;
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Get Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the non-secure Main Stack Pointer Limit(MSPLIM) when in secure state.
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) );
+  return result;
+#endif
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+
+  \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM).
+  \param [in]    MainStackPtrLimit  Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state.
+  \param [in]    MainStackPtrLimit  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  // without main extensions, the non-secure MSPLIM is RAZ/WI
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+
+/**
+  \brief   Get FPSCR
+  \details Returns the current value of the Floating Point Status/Control register.
+  \return               Floating Point Status/Control register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FPSCR(void)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#if __has_builtin(__builtin_arm_get_fpscr) 
+// Re-enable using built-in when GCC has been fixed
+// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
+  /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
+  return __builtin_arm_get_fpscr();
+#else
+  uint32_t result;
+
+  __ASM volatile ("VMRS %0, fpscr" : "=r" (result) );
+  return(result);
+#endif
+#else
+  return(0U);
+#endif
+}
+
+
+/**
+  \brief   Set FPSCR
+  \details Assigns the given value to the Floating Point Status/Control register.
+  \param [in]    fpscr  Floating Point Status/Control value to set
+ */
+__STATIC_FORCEINLINE void __set_FPSCR(uint32_t fpscr)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#if __has_builtin(__builtin_arm_set_fpscr)
+// Re-enable using built-in when GCC has been fixed
+// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
+  /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
+  __builtin_arm_set_fpscr(fpscr);
+#else
+  __ASM volatile ("VMSR fpscr, %0" : : "r" (fpscr) : "vfpcc", "memory");
+#endif
+#else
+  (void)fpscr;
+#endif
+}
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ##########################  Core Instruction Access  ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+  Access to dedicated instructions
+  @{
+*/
+
+/* Define macros for porting to both thumb1 and thumb2.
+ * For thumb1, use low register (r0-r7), specified by constraint "l"
+ * Otherwise, use general registers, specified by constraint "r" */
+#if defined (__thumb__) && !defined (__thumb2__)
+#define __CMSIS_GCC_OUT_REG(r) "=l" (r)
+#define __CMSIS_GCC_RW_REG(r) "+l" (r)
+#define __CMSIS_GCC_USE_REG(r) "l" (r)
+#else
+#define __CMSIS_GCC_OUT_REG(r) "=r" (r)
+#define __CMSIS_GCC_RW_REG(r) "+r" (r)
+#define __CMSIS_GCC_USE_REG(r) "r" (r)
+#endif
+
+/**
+  \brief   No Operation
+  \details No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP()                             __ASM volatile ("nop")
+
+/**
+  \brief   Wait For Interrupt
+  \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs.
+ */
+#define __WFI()                             __ASM volatile ("wfi")
+
+
+/**
+  \brief   Wait For Event
+  \details Wait For Event is a hint instruction that permits the processor to enter
+           a low-power state until one of a number of events occurs.
+ */
+#define __WFE()                             __ASM volatile ("wfe")
+
+
+/**
+  \brief   Send Event
+  \details Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV()                             __ASM volatile ("sev")
+
+
+/**
+  \brief   Instruction Synchronization Barrier
+  \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+           so that all instructions following the ISB are fetched from cache or memory,
+           after the instruction has been completed.
+ */
+__STATIC_FORCEINLINE void __ISB(void)
+{
+  __ASM volatile ("isb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Data Synchronization Barrier
+  \details Acts as a special kind of Data Memory Barrier.
+           It completes when all explicit memory accesses before this instruction complete.
+ */
+__STATIC_FORCEINLINE void __DSB(void)
+{
+  __ASM volatile ("dsb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Data Memory Barrier
+  \details Ensures the apparent order of the explicit memory operations before
+           and after the instruction, without ensuring their completion.
+ */
+__STATIC_FORCEINLINE void __DMB(void)
+{
+  __ASM volatile ("dmb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Reverse byte order (32 bit)
+  \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV(uint32_t value)
+{
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+  return __builtin_bswap32(value);
+#else
+  uint32_t result;
+
+  __ASM volatile ("rev %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return result;
+#endif
+}
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value)
+{
+  uint32_t result;
+
+  __ASM volatile ("rev16 %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return result;
+}
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE int16_t __REVSH(int16_t value)
+{
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+  return (int16_t)__builtin_bswap16(value);
+#else
+  int16_t result;
+
+  __ASM volatile ("revsh %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return result;
+#endif
+}
+
+
+/**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  op2 %= 32U;
+  if (op2 == 0U)
+  {
+    return op1;
+  }
+  return (op1 >> op2) | (op1 << (32U - op2));
+}
+
+
+/**
+  \brief   Breakpoint
+  \details Causes the processor to enter Debug state.
+           Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+  \param [in]    value  is ignored by the processor.
+                 If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value)                       __ASM volatile ("bkpt "#value)
+
+
+/**
+  \brief   Reverse bit order of value
+  \details Reverses the bit order of the given value.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value)
+{
+  uint32_t result;
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+   __ASM volatile ("rbit %0, %1" : "=r" (result) : "r" (value) );
+#else
+  uint32_t s = (4U /*sizeof(v)*/ * 8U) - 1U; /* extra shift needed at end */
+
+  result = value;                      /* r will be reversed bits of v; first get LSB of v */
+  for (value >>= 1U; value != 0U; value >>= 1U)
+  {
+    result <<= 1U;
+    result |= value & 1U;
+    s--;
+  }
+  result <<= s;                        /* shift when v's highest bits are zero */
+#endif
+  return result;
+}
+
+
+/**
+  \brief   Count leading zeros
+  \details Counts the number of leading zeros of a data value.
+  \param [in]  value  Value to count the leading zeros
+  \return             number of leading zeros in value
+ */
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+  /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
+     __builtin_clz(0) is undefined behaviour, so handle this case specially.
+     This guarantees ARM-compatible results if happening to compile on a non-ARM
+     target, and ensures the compiler doesn't decide to activate any
+     optimisations using the logic "value was passed to __builtin_clz, so it
+     is non-zero".
+     ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
+     single CLZ instruction.
+   */
+  if (value == 0U)
+  {
+    return 32U;
+  }
+  return __builtin_clz(value);
+}
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+/**
+  \brief   LDR Exclusive (8 bit)
+  \details Executes a exclusive LDR instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDREXB(volatile uint8_t *addr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrexb %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrexb %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+   return ((uint8_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDR Exclusive (16 bit)
+  \details Executes a exclusive LDR instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDREXH(volatile uint16_t *addr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrexh %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrexh %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+   return ((uint16_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDR Exclusive (32 bit)
+  \details Executes a exclusive LDR instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDREXW(volatile uint32_t *addr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldrex %0, %1" : "=r" (result) : "Q" (*addr) );
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (8 bit)
+  \details Executes a exclusive STR instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXB(uint8_t value, volatile uint8_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strexb %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (16 bit)
+  \details Executes a exclusive STR instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXH(uint16_t value, volatile uint16_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strexh %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (32 bit)
+  \details Executes a exclusive STR instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXW(uint32_t value, volatile uint32_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strex %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" (value) );
+   return(result);
+}
+
+
+/**
+  \brief   Remove the exclusive lock
+  \details Removes the exclusive lock which is created by LDREX.
+ */
+__STATIC_FORCEINLINE void __CLREX(void)
+{
+  __ASM volatile ("clrex" ::: "memory");
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  ARG1  Value to be saturated
+  \param [in]  ARG2  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+#define __SSAT(ARG1,ARG2) \
+__extension__ \
+({                          \
+  int32_t __RES, __ARG1 = (ARG1); \
+  __ASM ("ssat %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) ); \
+  __RES; \
+ })
+
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  ARG1  Value to be saturated
+  \param [in]  ARG2  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+#define __USAT(ARG1,ARG2) \
+ __extension__ \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1); \
+  __ASM ("usat %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) ); \
+  __RES; \
+ })
+
+
+/**
+  \brief   Rotate Right with Extend (32 bit)
+  \details Moves each bit of a bitstring right by one bit.
+           The carry input is shifted in at the left end of the bitstring.
+  \param [in]    value  Value to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value)
+{
+  uint32_t result;
+
+  __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+  return(result);
+}
+
+
+/**
+  \brief   LDRT Unprivileged (8 bit)
+  \details Executes a Unprivileged LDRT instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrbt %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" );
+#endif
+   return ((uint8_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (16 bit)
+  \details Executes a Unprivileged LDRT instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrht %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" );
+#endif
+   return ((uint16_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (32 bit)
+  \details Executes a Unprivileged LDRT instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) );
+   return(result);
+}
+
+
+/**
+  \brief   STRT Unprivileged (8 bit)
+  \details Executes a Unprivileged STRT instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr)
+{
+   __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   STRT Unprivileged (16 bit)
+  \details Executes a Unprivileged STRT instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr)
+{
+   __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   STRT Unprivileged (32 bit)
+  \details Executes a Unprivileged STRT instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr)
+{
+   __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) );
+}
+
+#else  /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+  if ((sat >= 1U) && (sat <= 32U))
+  {
+    const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+    const int32_t min = -1 - max ;
+    if (val > max)
+    {
+      return max;
+    }
+    else if (val < min)
+    {
+      return min;
+    }
+  }
+  return val;
+}
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+  if (sat <= 31U)
+  {
+    const uint32_t max = ((1U << sat) - 1U);
+    if (val > (int32_t)max)
+    {
+      return max;
+    }
+    else if (val < 0)
+    {
+      return 0U;
+    }
+  }
+  return (uint32_t)val;
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+/**
+  \brief   Load-Acquire (8 bit)
+  \details Executes a LDAB instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) );
+   return ((uint8_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (16 bit)
+  \details Executes a LDAH instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) );
+   return ((uint16_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (32 bit)
+  \details Executes a LDA instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release (8 bit)
+  \details Executes a STLB instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr)
+{
+   __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   Store-Release (16 bit)
+  \details Executes a STLH instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr)
+{
+   __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   Store-Release (32 bit)
+  \details Executes a STL instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr)
+{
+   __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (8 bit)
+  \details Executes a LDAB exclusive instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAEXB(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldaexb %0, %1" : "=r" (result) : "Q" (*ptr) );
+   return ((uint8_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (16 bit)
+  \details Executes a LDAH exclusive instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAEXH(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldaexh %0, %1" : "=r" (result) : "Q" (*ptr) );
+   return ((uint16_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (32 bit)
+  \details Executes a LDA exclusive instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDAEX(volatile uint32_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldaex %0, %1" : "=r" (result) : "Q" (*ptr) );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (8 bit)
+  \details Executes a STLB exclusive instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEXB(uint8_t value, volatile uint8_t *ptr)
+{
+   uint32_t result;
+
+   __ASM volatile ("stlexb %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (16 bit)
+  \details Executes a STLH exclusive instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEXH(uint16_t value, volatile uint16_t *ptr)
+{
+   uint32_t result;
+
+   __ASM volatile ("stlexh %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (32 bit)
+  \details Executes a STL exclusive instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEX(uint32_t value, volatile uint32_t *ptr)
+{
+   uint32_t result;
+
+   __ASM volatile ("stlex %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) );
+   return(result);
+}
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ###################  Compiler specific Intrinsics  ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+  Access to dedicated SIMD instructions
+  @{
+*/
+
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+
+__STATIC_FORCEINLINE uint32_t __SADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+#define __SSAT16(ARG1,ARG2) \
+({                          \
+  int32_t __RES, __ARG1 = (ARG1); \
+  __ASM ("ssat16 %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) ); \
+  __RES; \
+ })
+
+#define __USAT16(ARG1,ARG2) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1); \
+  __ASM ("usat16 %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) ); \
+  __RES; \
+ })
+
+__STATIC_FORCEINLINE uint32_t __UXTB16(uint32_t op1)
+{
+  uint32_t result;
+
+  __ASM volatile ("uxtb16 %0, %1" : "=r" (result) : "r" (op1));
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1)
+{
+  uint32_t result;
+
+  __ASM volatile ("sxtb16 %0, %1" : "=r" (result) : "r" (op1));
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUAD  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSD  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+
+  __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SEL  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+
+  __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE  int32_t __QADD( int32_t op1,  int32_t op2)
+{
+  int32_t result;
+
+  __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE  int32_t __QSUB( int32_t op1,  int32_t op2)
+{
+  int32_t result;
+
+  __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+#if 0
+#define __PKHBT(ARG1,ARG2,ARG3) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  __ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
+
+#define __PKHTB(ARG1,ARG2,ARG3) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  if (ARG3 == 0) \
+    __ASM ("pkhtb %0, %1, %2" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2)  ); \
+  else \
+    __ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
+#endif
+
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+
+__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
+{
+ int32_t result;
+
+ __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (result): "r"  (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+#endif /* (__ARM_FEATURE_DSP == 1) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#pragma GCC diagnostic pop
+
+#endif /* __CMSIS_GCC_H */
diff --git a/ARM_math/matrix_functions.h b/ARM_math/matrix_functions.h
new file mode 100644
index 0000000..d3f79a0
--- /dev/null
+++ b/ARM_math/matrix_functions.h
@@ -0,0 +1,842 @@
+/******************************************************************************
+ * @file     matrix_functions.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.10.0
+ * @date     08 July 2021
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _MATRIX_FUNCTIONS_H_
+#define _MATRIX_FUNCTIONS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "none.h"
+#include "utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * @defgroup groupMatrix Matrix Functions
+ *
+ * This set of functions provides basic matrix math operations.
+ * The functions operate on matrix data structures.  For example,
+ * the type
+ * definition for the floating-point matrix structure is shown
+ * below:
+ * <pre>
+ *     typedef struct
+ *     {
+ *       uint16_t numRows;     // number of rows of the matrix.
+ *       uint16_t numCols;     // number of columns of the matrix.
+ *       float32_t *pData;     // points to the data of the matrix.
+ *     } arm_matrix_instance_f32;
+ * </pre>
+ * There are similar definitions for Q15 and Q31 data types.
+ *
+ * The structure specifies the size of the matrix and then points to
+ * an array of data.  The array is of size <code>numRows X numCols</code>
+ * and the values are arranged in row order.  That is, the
+ * matrix element (i, j) is stored at:
+ * <pre>
+ *     pData[i*numCols + j]
+ * </pre>
+ *
+ * \par Init Functions
+ * There is an associated initialization function for each type of matrix
+ * data structure.
+ * The initialization function sets the values of the internal structure fields.
+ * Refer to \ref arm_mat_init_f32(), \ref arm_mat_init_q31() and \ref arm_mat_init_q15()
+ * for floating-point, Q31 and Q15 types,  respectively.
+ *
+ * \par
+ * Use of the initialization function is optional. However, if initialization function is used
+ * then the instance structure cannot be placed into a const data section.
+ * To place the instance structure in a const data
+ * section, manually initialize the data structure.  For example:
+ * <pre>
+ * <code>arm_matrix_instance_f32 S = {nRows, nColumns, pData};</code>
+ * <code>arm_matrix_instance_q31 S = {nRows, nColumns, pData};</code>
+ * <code>arm_matrix_instance_q15 S = {nRows, nColumns, pData};</code>
+ * </pre>
+ * where <code>nRows</code> specifies the number of rows, <code>nColumns</code>
+ * specifies the number of columns, and <code>pData</code> points to the
+ * data array.
+ *
+ * \par Size Checking
+ * By default all of the matrix functions perform size checking on the input and
+ * output matrices. For example, the matrix addition function verifies that the
+ * two input matrices and the output matrix all have the same number of rows and
+ * columns. If the size check fails the functions return:
+ * <pre>
+ *     ARM_MATH_SIZE_MISMATCH
+ * </pre>
+ * Otherwise the functions return
+ * <pre>
+ *     ARM_MATH_SUCCESS
+ * </pre>
+ * There is some overhead associated with this matrix size checking.
+ * The matrix size checking is enabled via the \#define
+ * <pre>
+ *     ARM_MATH_MATRIX_CHECK
+ * </pre>
+ * within the library project settings.  By default this macro is defined
+ * and size checking is enabled. By changing the project settings and
+ * undefining this macro size checking is eliminated and the functions
+ * run a bit faster. With size checking disabled the functions always
+ * return <code>ARM_MATH_SUCCESS</code>.
+ */
+
+  #define DEFAULT_HOUSEHOLDER_THRESHOLD_F64 (1.0e-16)
+  #define DEFAULT_HOUSEHOLDER_THRESHOLD_F32 (1.0e-12f)
+
+  /**
+   * @brief Instance structure for the floating-point matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    float32_t *pData;     /**< points to the data of the matrix. */
+  } arm_matrix_instance_f32;
+ 
+ /**
+   * @brief Instance structure for the floating-point matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    float64_t *pData;     /**< points to the data of the matrix. */
+  } arm_matrix_instance_f64;
+
+ /**
+   * @brief Instance structure for the Q7 matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q7_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q7;
+
+  /**
+   * @brief Instance structure for the Q15 matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q15_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q15;
+
+  /**
+   * @brief Instance structure for the Q31 matrix structure.
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q31_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q31;
+
+  /**
+   * @brief Floating-point matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix addition.
+   * @param[in]   pSrcA  points to the first input matrix structure
+   * @param[in]   pSrcB  points to the second input matrix structure
+   * @param[out]  pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix addition.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_add_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15, complex,  matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pScratch);
+
+  /**
+   * @brief Q31, complex, matrix multiplication.
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_mult_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        arm_matrix_instance_f32 * pDst);
+
+/**
+   * @brief Floating-point matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_f64(
+  const arm_matrix_instance_f64 * pSrc,
+        arm_matrix_instance_f64 * pDst);
+
+  /**
+   * @brief Floating-point complex matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pDst);
+
+
+  /**
+   * @brief Q15 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q15(
+  const arm_matrix_instance_q15 * pSrc,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q15 complex matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_trans_q15(
+  const arm_matrix_instance_q15 * pSrc,
+  arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q7 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q7(
+  const arm_matrix_instance_q7 * pSrc,
+        arm_matrix_instance_q7 * pDst);
+
+  /**
+   * @brief Q31 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q31(
+  const arm_matrix_instance_q31 * pSrc,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Q31 complex matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_cmplx_trans_q31(
+  const arm_matrix_instance_q31 * pSrc,
+  arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Floating-point matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_f64(
+  const arm_matrix_instance_f64 * pSrcA,
+  const arm_matrix_instance_f64 * pSrcB,
+        arm_matrix_instance_f64 * pDst);
+
+  /**
+   * @brief Floating-point matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_f32(
+  const arm_matrix_instance_f32 *pSrcMat, 
+  const float32_t *pVec, 
+  float32_t *pDst);
+
+  /**
+   * @brief Q7 matrix multiplication
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q7(
+  const arm_matrix_instance_q7 * pSrcA,
+  const arm_matrix_instance_q7 * pSrcB,
+        arm_matrix_instance_q7 * pDst,
+        q7_t * pState);
+
+  /**
+   * @brief Q7 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q7(
+  const arm_matrix_instance_q7 *pSrcMat, 
+  const q7_t *pVec, 
+  q7_t *pDst);
+
+  /**
+   * @brief Q15 matrix multiplication
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pState);
+
+  /**
+   * @brief Q15 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q15(
+  const arm_matrix_instance_q15 *pSrcMat, 
+  const q15_t *pVec, 
+  q15_t *pDst);
+
+  /**
+   * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_fast_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t * pState);
+
+  /**
+   * @brief Q31 matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Q31 matrix multiplication
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_opt_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst,
+        q31_t *pState);
+
+  /**
+   * @brief Q31 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q31(
+  const arm_matrix_instance_q31 *pSrcMat, 
+  const q31_t *pVec, 
+  q31_t *pDst);
+
+  /**
+   * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_fast_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Floating-point matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_f64(
+  const arm_matrix_instance_f64 * pSrcA,
+  const arm_matrix_instance_f64 * pSrcB,
+        arm_matrix_instance_f64 * pDst);
+
+  /**
+   * @brief Q15 matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix subtraction
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_sub_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief Floating-point matrix scaling.
+   * @param[in]  pSrc   points to the input matrix
+   * @param[in]  scale  scale factor
+   * @param[out] pDst   points to the output matrix
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_f32(
+  const arm_matrix_instance_f32 * pSrc,
+        float32_t scale,
+        arm_matrix_instance_f32 * pDst);
+
+  /**
+   * @brief Q15 matrix scaling.
+   * @param[in]  pSrc        points to input matrix
+   * @param[in]  scaleFract  fractional portion of the scale factor
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to output matrix
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_q15(
+  const arm_matrix_instance_q15 * pSrc,
+        q15_t scaleFract,
+        int32_t shift,
+        arm_matrix_instance_q15 * pDst);
+
+  /**
+   * @brief Q31 matrix scaling.
+   * @param[in]  pSrc        points to input matrix
+   * @param[in]  scaleFract  fractional portion of the scale factor
+   * @param[in]  shift       number of bits to shift the result by
+   * @param[out] pDst        points to output matrix structure
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_scale_q31(
+  const arm_matrix_instance_q31 * pSrc,
+        q31_t scaleFract,
+        int32_t shift,
+        arm_matrix_instance_q31 * pDst);
+
+  /**
+   * @brief  Q31 matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void arm_mat_init_q31(
+        arm_matrix_instance_q31 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        q31_t * pData);
+
+  /**
+   * @brief  Q15 matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void arm_mat_init_q15(
+        arm_matrix_instance_q15 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        q15_t * pData);
+
+  /**
+   * @brief  Floating-point matrix initialization.
+   * @param[in,out] S         points to an instance of the floating-point matrix structure.
+   * @param[in]     nRows     number of rows in the matrix.
+   * @param[in]     nColumns  number of columns in the matrix.
+   * @param[in]     pData     points to the matrix data array.
+   */
+void arm_mat_init_f32(
+        arm_matrix_instance_f32 * S,
+        uint16_t nRows,
+        uint16_t nColumns,
+        float32_t * pData);
+
+
+
+  /**
+   * @brief Floating-point matrix inverse.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
+   */
+  arm_status arm_mat_inverse_f32(
+  const arm_matrix_instance_f32 * src,
+  arm_matrix_instance_f32 * dst);
+
+
+  /**
+   * @brief Floating-point matrix inverse.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix is singular (does not have an inverse), then the algorithm terminates and returns error status ARM_MATH_SINGULAR.
+   */
+  arm_status arm_mat_inverse_f64(
+  const arm_matrix_instance_f64 * src,
+  arm_matrix_instance_f64 * dst);
+
+ /**
+   * @brief Floating-point Cholesky decomposition of Symmetric Positive Definite Matrix.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
+   * The decomposition is returning a lower triangular matrix.
+   */
+  arm_status arm_mat_cholesky_f64(
+  const arm_matrix_instance_f64 * src,
+  arm_matrix_instance_f64 * dst);
+
+ /**
+   * @brief Floating-point Cholesky decomposition of Symmetric Positive Definite Matrix.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] dst   points to the instance of the output floating-point matrix structure.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
+   * The decomposition is returning a lower triangular matrix.
+   */
+  arm_status arm_mat_cholesky_f32(
+  const arm_matrix_instance_f32 * src,
+  arm_matrix_instance_f32 * dst);
+
+  /**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut  The upper triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of UT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+  */
+  arm_status arm_mat_solve_upper_triangular_f32(
+  const arm_matrix_instance_f32 * ut,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst);
+
+ /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt  The lower triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+   */
+  arm_status arm_mat_solve_lower_triangular_f32(
+  const arm_matrix_instance_f32 * lt,
+  const arm_matrix_instance_f32 * a,
+  arm_matrix_instance_f32 * dst);
+
+
+  /**
+   * @brief Solve UT . X = A where UT is an upper triangular matrix
+   * @param[in]  ut  The upper triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of UT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+  */
+  arm_status arm_mat_solve_upper_triangular_f64(
+  const arm_matrix_instance_f64 * ut,
+  const arm_matrix_instance_f64 * a,
+  arm_matrix_instance_f64 * dst);
+
+ /**
+   * @brief Solve LT . X = A where LT is a lower triangular matrix
+   * @param[in]  lt  The lower triangular matrix
+   * @param[in]  a  The matrix a
+   * @param[out] dst The solution X of LT . X = A
+   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+   */
+  arm_status arm_mat_solve_lower_triangular_f64(
+  const arm_matrix_instance_f64 * lt,
+  const arm_matrix_instance_f64 * a,
+  arm_matrix_instance_f64 * dst);
+
+
+  /**
+   * @brief Floating-point LDL decomposition of Symmetric Positive Semi-Definite Matrix.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] l   points to the instance of the output floating-point triangular matrix structure.
+   * @param[out] d   points to the instance of the output floating-point diagonal matrix structure.
+   * @param[out] p   points to the instance of the output floating-point permutation vector.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * The decomposition is returning a lower triangular matrix.
+   */
+  arm_status arm_mat_ldlt_f32(
+  const arm_matrix_instance_f32 * src,
+  arm_matrix_instance_f32 * l,
+  arm_matrix_instance_f32 * d,
+  uint16_t * pp);
+
+ /**
+   * @brief Floating-point LDL decomposition of Symmetric Positive Semi-Definite Matrix.
+   * @param[in]  src   points to the instance of the input floating-point matrix structure.
+   * @param[out] l   points to the instance of the output floating-point triangular matrix structure.
+   * @param[out] d   points to the instance of the output floating-point diagonal matrix structure.
+   * @param[out] p   points to the instance of the output floating-point permutation vector.
+   * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+   * If the input matrix does not have a decomposition, then the algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+   * The decomposition is returning a lower triangular matrix.
+   */
+  arm_status arm_mat_ldlt_f64(
+  const arm_matrix_instance_f64 * src,
+  arm_matrix_instance_f64 * l,
+  arm_matrix_instance_f64 * d,
+  uint16_t * pp);
+
+/**
+  @brief         QR decomposition of a m x n floating point matrix with m >= n.
+  @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
+  @param[in]     threshold norm2 threshold.
+  @param[out]    pOutR     points to output R matrix structure of dimension m x n
+  @param[out]    pOutQ     points to output Q matrix structure of dimension m x m
+  @param[out]    pOutTau   points to Householder scaling factors of dimension n
+  @param[inout]  pTmpA     points to a temporary vector of dimension m.
+  @param[inout]  pTmpB     points to a temporary vector of dimension n.
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
+ */
+
+arm_status arm_mat_qr_f32(
+    const arm_matrix_instance_f32 * pSrc,
+    const float32_t threshold,
+    arm_matrix_instance_f32 * pOutR,
+    arm_matrix_instance_f32 * pOutQ,
+    float32_t * pOutTau,
+    float32_t *pTmpA,
+    float32_t *pTmpB
+    );
+
+/**
+  @brief         QR decomposition of a m x n floating point matrix with m >= n.
+  @param[in]     pSrc      points to input matrix structure. The source matrix is modified by the function.
+  @param[in]     threshold norm2 threshold.  
+  @param[out]    pOutR     points to output R matrix structure of dimension m x n
+  @param[out]    pOutQ     points to output Q matrix structure of dimension m x m
+  @param[out]    pOutTau   points to Householder scaling factors of dimension n
+  @param[inout]  pTmpA     points to a temporary vector of dimension m.
+  @param[inout]  pTmpB     points to a temporary vector of dimension n.
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+                   - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
+ */
+
+arm_status arm_mat_qr_f64(
+    const arm_matrix_instance_f64 * pSrc,
+    const float64_t threshold,
+    arm_matrix_instance_f64 * pOutR,
+    arm_matrix_instance_f64 * pOutQ,
+    float64_t * pOutTau,
+    float64_t *pTmpA,
+    float64_t *pTmpB
+    );
+
+/**
+  @brief         Householder transform of a floating point vector.
+  @param[in]     pSrc        points to the input vector.
+  @param[in]     threshold   norm2 threshold.
+  @param[in]     blockSize   dimension of the vector space.
+  @param[outQ]   pOut        points to the output vector.
+  @return        beta        return the scaling factor beta
+ */
+
+float32_t arm_householder_f32(
+    const float32_t * pSrc,
+    const float32_t threshold,
+    uint32_t    blockSize,
+    float32_t * pOut
+    );
+
+/**
+  @brief         Householder transform of a double floating point vector.
+  @param[in]     pSrc        points to the input vector.
+  @param[in]     threshold   norm2 threshold.
+  @param[in]     blockSize   dimension of the vector space.
+  @param[outQ]   pOut        points to the output vector.
+  @return        beta        return the scaling factor beta
+ */
+
+float64_t arm_householder_f64(
+    const float64_t * pSrc,
+    const float64_t threshold,
+    uint32_t    blockSize,
+    float64_t * pOut
+    );
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _MATRIX_FUNCTIONS_H_ */
diff --git a/ARM_math/matrix_utils.h b/ARM_math/matrix_utils.h
new file mode 100644
index 0000000..552406f
--- /dev/null
+++ b/ARM_math/matrix_utils.h
@@ -0,0 +1,640 @@
+/******************************************************************************
+ * @file     matrix_utils.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.11.0
+ * @date     30 May 2022
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2022 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _MATRIX_UTILS_H_
+#define _MATRIX_UTILS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "none.h"
+#include "utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#define ELEM(A,ROW,COL) &((A)->pData[(A)->numCols* (ROW) + (COL)])
+
+#define SCALE_COL_T(T,CAST,A,ROW,v,i)        \
+{                                       \
+  int32_t w;                            \
+  T *data = (A)->pData;                 \
+  const int32_t numCols = (A)->numCols; \
+  const int32_t nb = (A)->numRows - ROW;\
+                                        \
+  data += i + numCols * (ROW);          \
+                                        \
+  for(w=0;w < nb; w++)                  \
+  {                                     \
+     *data *= CAST v;                   \
+     data += numCols;                   \
+  }                                     \
+}
+
+#define COPY_COL_T(T,A,ROW,COL,DST)               \
+{                                                 \
+    uint32_t row;                                 \
+    T *pb=DST;                                    \
+    T *pa = (A)->pData + ROW * (A)->numCols + COL;\
+    for(row = ROW; row < (A)->numRows; row ++)    \
+    {                                             \
+         *pb++ = *pa;                             \
+         pa += (A)->numCols;                      \
+    }                                             \
+}
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define SWAP_ROWS_F16(A,COL,i,j)                  \
+  {                                               \
+    int cnt = ((A)->numCols)-(COL);               \
+    int32_t w;                                   \
+    float16_t *data = (A)->pData;                 \
+    const int32_t numCols = (A)->numCols;        \
+                                                  \
+    for(w=(COL);w < numCols; w+=8)                \
+    {                                             \
+       f16x8_t tmpa,tmpb;                         \
+       mve_pred16_t p0 = vctp16q(cnt);            \
+                                                  \
+       tmpa=vldrhq_z_f16(&data[i*numCols + w],p0);\
+       tmpb=vldrhq_z_f16(&data[j*numCols + w],p0);\
+                                                  \
+       vstrhq_p(&data[i*numCols + w], tmpb, p0);  \
+       vstrhq_p(&data[j*numCols + w], tmpa, p0);  \
+                                                  \
+       cnt -= 8;                                  \
+    }                                             \
+  }
+
+#define SCALE_ROW_F16(A,COL,v,i)                   \
+{                                                   \
+  int cnt = ((A)->numCols)-(COL);                   \
+  int32_t w;                                       \
+  float16_t *data = (A)->pData;                     \
+  const int32_t numCols = (A)->numCols;            \
+                                                    \
+  for(w=(COL);w < numCols; w+=8)                    \
+  {                                                 \
+       f16x8_t tmpa;                                \
+       mve_pred16_t p0 = vctp16q(cnt);              \
+       tmpa = vldrhq_z_f16(&data[i*numCols + w],p0);\
+       tmpa = vmulq_n_f16(tmpa,(_Float16)v);                  \
+       vstrhq_p(&data[i*numCols + w], tmpa, p0);    \
+       cnt -= 8;                                    \
+  }                                                 \
+                                                    \
+}
+
+#define MAC_ROW_F16(COL,A,i,v,B,j)                   \
+{                                                    \
+  int cnt = ((A)->numCols)-(COL);                    \
+  int32_t w;                                        \
+  float16_t *dataA = (A)->pData;                     \
+  float16_t *dataB = (B)->pData;                     \
+  const int32_t numCols = (A)->numCols;             \
+                                                     \
+  for(w=(COL);w < numCols; w+=8)                     \
+  {                                                  \
+       f16x8_t tmpa,tmpb;                            \
+       mve_pred16_t p0 = vctp16q(cnt);               \
+       tmpa = vldrhq_z_f16(&dataA[i*numCols + w],p0);\
+       tmpb = vldrhq_z_f16(&dataB[j*numCols + w],p0);\
+       tmpa = vfmaq_n_f16(tmpa,tmpb,v);              \
+       vstrhq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       cnt -= 8;                                     \
+  }                                                  \
+                                                     \
+}
+
+#define MAS_ROW_F16(COL,A,i,v,B,j)                   \
+{                                                    \
+  int cnt = ((A)->numCols)-(COL);                    \
+  int32_t w;                                        \
+  float16_t *dataA = (A)->pData;                     \
+  float16_t *dataB = (B)->pData;                     \
+  const int32_t numCols = (A)->numCols;             \
+  f16x8_t vec=vdupq_n_f16(v);                        \
+                                                     \
+  for(w=(COL);w < numCols; w+=8)                     \
+  {                                                  \
+       f16x8_t tmpa,tmpb;                            \
+       mve_pred16_t p0 = vctp16q(cnt);               \
+       tmpa = vldrhq_z_f16(&dataA[i*numCols + w],p0);\
+       tmpb = vldrhq_z_f16(&dataB[j*numCols + w],p0);\
+       tmpa = vfmsq_f16(tmpa,tmpb,vec);              \
+       vstrhq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       cnt -= 8;                                     \
+  }                                                  \
+                                                     \
+}
+
+#else
+
+
+#define SWAP_ROWS_F16(A,COL,i,j)       \
+{                                      \
+  int32_t w;                           \
+  float16_t *dataI = (A)->pData;       \
+  float16_t *dataJ = (A)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  dataI += i*numCols + (COL);          \
+  dataJ += j*numCols + (COL);          \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     float16_t tmp;                    \
+     tmp = *dataI;                     \
+     *dataI++ = *dataJ;                \
+     *dataJ++ = tmp;                   \
+  }                                    \
+}
+
+#define SCALE_ROW_F16(A,COL,v,i)       \
+{                                      \
+  int32_t w;                           \
+  float16_t *data = (A)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  data += i*numCols + (COL);           \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *data++ *= (_Float16)v;           \
+  }                                    \
+}
+
+
+#define MAC_ROW_F16(COL,A,i,v,B,j)                \
+{                                                 \
+  int32_t w;                                      \
+  float16_t *dataA = (A)->pData;                  \
+  float16_t *dataB = (B)->pData;                  \
+  const int32_t numCols = (A)->numCols;           \
+  const int32_t nb = numCols-(COL);               \
+                                                  \
+  dataA += i*numCols + (COL);                     \
+  dataB += j*numCols + (COL);                     \
+                                                  \
+  for(w=0;w < nb; w++)                            \
+  {                                               \
+     *dataA++ += (_Float16)v * (_Float16)*dataB++;\
+  }                                               \
+}
+
+#define MAS_ROW_F16(COL,A,i,v,B,j)                \
+{                                                 \
+  int32_t w;                                      \
+  float16_t *dataA = (A)->pData;                  \
+  float16_t *dataB = (B)->pData;                  \
+  const int32_t numCols = (A)->numCols;           \
+  const int32_t nb = numCols-(COL);               \
+                                                  \
+  dataA += i*numCols + (COL);                     \
+  dataB += j*numCols + (COL);                     \
+                                                  \
+  for(w=0;w < nb; w++)                            \
+  {                                               \
+     *dataA++ -= (_Float16)v * (_Float16)*dataB++;\
+  }                                               \
+}
+
+#endif /*defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/
+
+/* Functions with only a scalar version */
+#define COPY_COL_F16(A,ROW,COL,DST) \
+  COPY_COL_T(float16_t,A,ROW,COL,DST)
+
+#define SCALE_COL_F16(A,ROW,v,i)        \
+  SCALE_COL_T(float16_t,(_Float16),A,ROW,v,i)
+  
+#endif /* defined(ARM_FLOAT16_SUPPORTED)*/
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define SWAP_ROWS_F32(A,COL,i,j)                  \
+  {                                               \
+    int cnt = ((A)->numCols)-(COL);               \
+    float32_t *data = (A)->pData;                 \
+    const int32_t numCols = (A)->numCols;        \
+    int32_t w;                                   \
+                                                  \
+    for(w=(COL);w < numCols; w+=4)                \
+    {                                             \
+       f32x4_t tmpa,tmpb;                         \
+       mve_pred16_t p0 = vctp32q(cnt);            \
+                                                  \
+       tmpa=vldrwq_z_f32(&data[i*numCols + w],p0);\
+       tmpb=vldrwq_z_f32(&data[j*numCols + w],p0);\
+                                                  \
+       vstrwq_p(&data[i*numCols + w], tmpb, p0);  \
+       vstrwq_p(&data[j*numCols + w], tmpa, p0);  \
+                                                  \
+       cnt -= 4;                                  \
+    }                                             \
+  }
+
+#define MAC_ROW_F32(COL,A,i,v,B,j)                   \
+{                                                    \
+  int cnt = ((A)->numCols)-(COL);                    \
+  float32_t *dataA = (A)->pData;                     \
+  float32_t *dataB = (B)->pData;                     \
+  const int32_t numCols = (A)->numCols;             \
+  int32_t w;                                        \
+                                                     \
+  for(w=(COL);w < numCols; w+=4)                     \
+  {                                                  \
+       f32x4_t tmpa,tmpb;                            \
+       mve_pred16_t p0 = vctp32q(cnt);               \
+       tmpa = vldrwq_z_f32(&dataA[i*numCols + w],p0);\
+       tmpb = vldrwq_z_f32(&dataB[j*numCols + w],p0);\
+       tmpa = vfmaq_n_f32(tmpa,tmpb,v);              \
+       vstrwq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       cnt -= 4;                                     \
+  }                                                  \
+                                                     \
+}
+
+#define MAS_ROW_F32(COL,A,i,v,B,j)                   \
+{                                                    \
+  int cnt = ((A)->numCols)-(COL);                    \
+  float32_t *dataA = (A)->pData;                     \
+  float32_t *dataB = (B)->pData;                     \
+  const int32_t numCols = (A)->numCols;             \
+  int32_t w;                                        \
+  f32x4_t vec=vdupq_n_f32(v);                        \
+                                                     \
+  for(w=(COL);w < numCols; w+=4)                     \
+  {                                                  \
+       f32x4_t tmpa,tmpb;                            \
+       mve_pred16_t p0 = vctp32q(cnt);               \
+       tmpa = vldrwq_z_f32(&dataA[i*numCols + w],p0);\
+       tmpb = vldrwq_z_f32(&dataB[j*numCols + w],p0);\
+       tmpa = vfmsq_f32(tmpa,tmpb,vec);              \
+       vstrwq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       cnt -= 4;                                     \
+  }                                                  \
+                                                     \
+}
+
+#define SCALE_ROW_F32(A,COL,v,i)                    \
+{                                                   \
+  int cnt = ((A)->numCols)-(COL);                   \
+  float32_t *data = (A)->pData;                     \
+  const int32_t numCols = (A)->numCols;            \
+  int32_t w;                                       \
+                                                    \
+  for(w=(COL);w < numCols; w+=4)                    \
+  {                                                 \
+       f32x4_t tmpa;                                \
+       mve_pred16_t p0 = vctp32q(cnt);              \
+       tmpa = vldrwq_z_f32(&data[i*numCols + w],p0);\
+       tmpa = vmulq_n_f32(tmpa,v);                  \
+       vstrwq_p(&data[i*numCols + w], tmpa, p0);    \
+       cnt -= 4;                                    \
+  }                                                 \
+                                                    \
+}
+
+#elif defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define SWAP_ROWS_F32(A,COL,i,j)       \
+{                                      \
+  int32_t w;                           \
+  float32_t *dataI = (A)->pData;       \
+  float32_t *dataJ = (A)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - COL;    \
+                                       \
+  dataI += i*numCols + (COL);          \
+  dataJ += j*numCols + (COL);          \
+                                       \
+  float32_t tmp;                       \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     tmp = *dataI;                     \
+     *dataI++ = *dataJ;                \
+     *dataJ++ = tmp;                   \
+  }                                    \
+}
+
+#define MAC_ROW_F32(COL,A,i,v,B,j)     \
+{                                      \
+  float32_t *dataA = (A)->pData;       \
+  float32_t *dataB = (B)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - (COL);  \
+  int32_t nbElems;                     \
+  f32x4_t vec = vdupq_n_f32(v);        \
+                                       \
+  nbElems = nb >> 2;                   \
+                                       \
+  dataA += i*numCols + (COL);          \
+  dataB += j*numCols + (COL);          \
+                                       \
+  while(nbElems>0)                     \
+  {                                    \
+       f32x4_t tmpa,tmpb;              \
+       tmpa = vld1q_f32(dataA,p0);     \
+       tmpb = vld1q_f32(dataB,p0);     \
+       tmpa = vmlaq_f32(tmpa,tmpb,vec);\
+       vst1q_f32(dataA, tmpa, p0);     \
+       nbElems--;                      \
+       dataA += 4;                     \
+       dataB += 4;                     \
+  }                                    \
+                                       \
+  nbElems = nb & 3;                    \
+  while(nbElems > 0)                   \
+  {                                    \
+     *dataA++ += v* *dataB++;          \
+     nbElems--;                        \
+  }                                    \
+}
+
+#define MAS_ROW_F32(COL,A,i,v,B,j)     \
+{                                      \
+  float32_t *dataA = (A)->pData;       \
+  float32_t *dataB = (B)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - (COL);  \
+  int32_t nbElems;                     \
+  f32x4_t vec = vdupq_n_f32(v);        \
+                                       \
+  nbElems = nb >> 2;                   \
+                                       \
+  dataA += i*numCols + (COL);          \
+  dataB += j*numCols + (COL);          \
+                                       \
+  while(nbElems>0)                     \
+  {                                    \
+       f32x4_t tmpa,tmpb;              \
+       tmpa = vld1q_f32(dataA);        \
+       tmpb = vld1q_f32(dataB);        \
+       tmpa = vmlsq_f32(tmpa,tmpb,vec);\
+       vst1q_f32(dataA, tmpa);         \
+       nbElems--;                      \
+       dataA += 4;                     \
+       dataB += 4;                     \
+  }                                    \
+                                       \
+  nbElems = nb & 3;                    \
+  while(nbElems > 0)                   \
+  {                                    \
+     *dataA++ -= v* *dataB++;          \
+     nbElems--;                        \
+  }                                    \
+}
+
+#define SCALE_ROW_F32(A,COL,v,i)        \
+{                                       \
+  float32_t *data = (A)->pData;         \
+  const int32_t numCols = (A)->numCols; \
+  const int32_t nb = numCols - (COL);   \
+  int32_t nbElems;                      \
+  f32x4_t vec = vdupq_n_f32(v);         \
+                                        \
+  nbElems = nb >> 2;                    \
+                                        \
+  data += i*numCols + (COL);            \
+  while(nbElems>0)                      \
+  {                                     \
+       f32x4_t tmpa;                    \
+       tmpa = vld1q_f32(data);          \
+       tmpa = vmulq_f32(tmpa,vec);      \
+       vst1q_f32(data, tmpa);           \
+       data += 4;                       \
+       nbElems --;                      \
+  }                                     \
+                                        \
+  nbElems = nb & 3;                     \
+  while(nbElems > 0)                    \
+  {                                     \
+     *data++ *= v;                      \
+     nbElems--;                         \
+  }                                     \
+                                        \
+}
+
+#else
+
+#define SWAP_ROWS_F32(A,COL,i,j)       \
+{                                      \
+  int32_t w;                           \
+  float32_t tmp;                       \
+  float32_t *dataI = (A)->pData;       \
+  float32_t *dataJ = (A)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - COL;    \
+                                       \
+  dataI += i*numCols + (COL);          \
+  dataJ += j*numCols + (COL);          \
+                                       \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     tmp = *dataI;                     \
+     *dataI++ = *dataJ;                \
+     *dataJ++ = tmp;                   \
+  }                                    \
+}
+
+#define SCALE_ROW_F32(A,COL,v,i)       \
+{                                      \
+  int32_t w;                           \
+  float32_t *data = (A)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - COL;    \
+                                       \
+  data += i*numCols + (COL);           \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *data++ *= v;                     \
+  }                                    \
+}
+
+
+#define MAC_ROW_F32(COL,A,i,v,B,j)     \
+{                                      \
+  int32_t w;                           \
+  float32_t *dataA = (A)->pData;       \
+  float32_t *dataB = (B)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  dataA = dataA + i*numCols + (COL);   \
+  dataB = dataB + j*numCols + (COL);   \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *dataA++ += v* *dataB++;          \
+  }                                    \
+}
+
+#define MAS_ROW_F32(COL,A,i,v,B,j)     \
+{                                      \
+  int32_t w;                           \
+  float32_t *dataA = (A)->pData;       \
+  float32_t *dataB = (B)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  dataA = dataA + i*numCols + (COL);   \
+  dataB = dataB + j*numCols + (COL);   \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *dataA++ -= v* *dataB++;          \
+  }                                    \
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/* Functions with only a scalar version */
+
+#define COPY_COL_F32(A,ROW,COL,DST) \
+  COPY_COL_T(float32_t,A,ROW,COL,DST)
+
+#define COPY_COL_F64(A,ROW,COL,DST) \
+  COPY_COL_T(float64_t,A,ROW,COL,DST)
+
+#define SWAP_COLS_F32(A,COL,i,j)               \
+{                                              \
+  int32_t w;                                  \
+  float32_t *data = (A)->pData;                \
+  const int32_t numCols = (A)->numCols;       \
+  for(w=(COL);w < numCols; w++)                \
+  {                                            \
+     float32_t tmp;                            \
+     tmp = data[w*numCols + i];                \
+     data[w*numCols + i] = data[w*numCols + j];\
+     data[w*numCols + j] = tmp;                \
+  }                                            \
+}
+
+#define SCALE_COL_F32(A,ROW,v,i)        \
+  SCALE_COL_T(float32_t,,A,ROW,v,i)
+
+#define SWAP_ROWS_F64(A,COL,i,j)       \
+{                                      \
+  int32_t w;                           \
+  float64_t *dataI = (A)->pData;       \
+  float64_t *dataJ = (A)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  dataI += i*numCols + (COL);          \
+  dataJ += j*numCols + (COL);          \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     float64_t tmp;                    \
+     tmp = *dataI;                     \
+     *dataI++ = *dataJ;                \
+     *dataJ++ = tmp;                   \
+  }                                    \
+}
+
+#define SWAP_COLS_F64(A,COL,i,j)               \
+{                                              \
+  int32_t w;                                  \
+  float64_t *data = (A)->pData;                \
+  const int32_t numCols = (A)->numCols;       \
+  for(w=(COL);w < numCols; w++)                \
+  {                                            \
+     float64_t tmp;                            \
+     tmp = data[w*numCols + i];                \
+     data[w*numCols + i] = data[w*numCols + j];\
+     data[w*numCols + j] = tmp;                \
+  }                                            \
+}
+
+#define SCALE_ROW_F64(A,COL,v,i)       \
+{                                      \
+  int32_t w;                           \
+  float64_t *data = (A)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  data += i*numCols + (COL);           \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *data++ *= v;                     \
+  }                                    \
+}
+
+#define SCALE_COL_F64(A,ROW,v,i)        \
+  SCALE_COL_T(float64_t,,A,ROW,v,i)
+
+#define MAC_ROW_F64(COL,A,i,v,B,j)      \
+{                                       \
+  int32_t w;                           \
+  float64_t *dataA = (A)->pData;        \
+  float64_t *dataB = (B)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);     \
+                                        \
+  dataA += i*numCols + (COL);           \
+  dataB += j*numCols + (COL);           \
+                                        \
+  for(w=0;w < nb; w++)                  \
+  {                                     \
+     *dataA++ += v* *dataB++;           \
+  }                                     \
+}
+
+#define MAS_ROW_F64(COL,A,i,v,B,j)      \
+{                                       \
+  int32_t w;                           \
+  float64_t *dataA = (A)->pData;        \
+  float64_t *dataB = (B)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);     \
+                                        \
+  dataA += i*numCols + (COL);           \
+  dataB += j*numCols + (COL);           \
+                                        \
+  for(w=0;w < nb; w++)                  \
+  {                                     \
+     *dataA++ -= v* *dataB++;           \
+  }                                     \
+}
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _MATRIX_UTILS_H_ */
diff --git a/ARM_math/none.h b/ARM_math/none.h
new file mode 100644
index 0000000..130386e
--- /dev/null
+++ b/ARM_math/none.h
@@ -0,0 +1,576 @@
+/******************************************************************************
+ * @file     none.h
+ * @brief    Intrinsincs when no DSP extension available
+ * @version  V1.9.0
+ * @date     20. July 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+
+Definitions in this file are allowing to reuse some versions of the
+CMSIS-DSP to build on a core (M0 for instance) or a host where
+DSP extension are not available.
+
+Ideally a pure C version should have been used instead.
+But those are not always available or use a restricted set
+of intrinsics.
+
+*/
+ 
+#ifndef _NONE_H_
+#define _NONE_H_
+
+#include "arm_math_types.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+ 
+
+/*
+
+Normally those kind of definitions are in a compiler file
+in Core or Core_A.
+
+But for MSVC compiler it is a bit special. The goal is very specific
+to CMSIS-DSP and only to allow the use of this library from other
+systems like Python or Matlab.
+
+MSVC is not going to be used to cross-compile to ARM. So, having a MSVC
+compiler file in Core or Core_A would not make sense.
+
+*/
+#if defined ( _MSC_VER ) || defined(__GNUC_PYTHON__) || defined(__APPLE_CC__)
+    __STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data)
+    {
+      if (data == 0U) { return 32U; }
+
+      uint32_t count = 0U;
+      uint32_t mask = 0x80000000U;
+
+      while ((data & mask) == 0U)
+      {
+        count += 1U;
+        mask = mask >> 1U;
+      }
+      return count;
+    }
+
+  __STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+  {
+    if ((sat >= 1U) && (sat <= 32U))
+    {
+      const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+      const int32_t min = -1 - max ;
+      if (val > max)
+      {
+        return max;
+      }
+      else if (val < min)
+      {
+        return min;
+      }
+    }
+    return val;
+  }
+
+  __STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+  {
+    if (sat <= 31U)
+    {
+      const uint32_t max = ((1U << sat) - 1U);
+      if (val > (int32_t)max)
+      {
+        return max;
+      }
+      else if (val < 0)
+      {
+        return 0U;
+      }
+    }
+    return (uint32_t)val;
+  }
+
+ /**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  op2 %= 32U;
+  if (op2 == 0U)
+  {
+    return op1;
+  }
+  return (op1 >> op2) | (op1 << (32U - op2));
+}
+
+
+#endif
+
+/**
+   * @brief Clips Q63 to Q31 values.
+   */
+  __STATIC_FORCEINLINE q31_t clip_q63_to_q31(
+  q63_t x)
+  {
+    return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
+      ((0x7FFFFFFF ^ ((q31_t) (x >> 63)))) : (q31_t) x;
+  }
+
+  /**
+   * @brief Clips Q63 to Q15 values.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q63_to_q15(
+  q63_t x)
+  {
+    return ((q31_t) (x >> 32) != ((q31_t) x >> 31)) ?
+      ((0x7FFF ^ ((q15_t) (x >> 63)))) : (q15_t) (x >> 15);
+  }
+
+  /**
+   * @brief Clips Q31 to Q7 values.
+   */
+  __STATIC_FORCEINLINE q7_t clip_q31_to_q7(
+  q31_t x)
+  {
+    return ((q31_t) (x >> 24) != ((q31_t) x >> 23)) ?
+      ((0x7F ^ ((q7_t) (x >> 31)))) : (q7_t) x;
+  }
+
+  /**
+   * @brief Clips Q31 to Q15 values.
+   */
+  __STATIC_FORCEINLINE q15_t clip_q31_to_q15(
+  q31_t x)
+  {
+    return ((q31_t) (x >> 16) != ((q31_t) x >> 15)) ?
+      ((0x7FFF ^ ((q15_t) (x >> 31)))) : (q15_t) x;
+  }
+
+  /**
+   * @brief Multiplies 32 X 64 and returns 32 bit result in 2.30 format.
+   */
+  __STATIC_FORCEINLINE q63_t mult32x64(
+  q63_t x,
+  q31_t y)
+  {
+    return ((((q63_t) (x & 0x00000000FFFFFFFF) * y) >> 32) +
+            (((q63_t) (x >> 32)                * y)      )  );
+  }
+
+/* SMMLAR */
+#define multAcc_32x32_keep32_R(a, x, y) \
+    a = (q31_t) (((((q63_t) a) << 32) + ((q63_t) x * y) + 0x80000000LL ) >> 32)
+
+/* SMMLSR */
+#define multSub_32x32_keep32_R(a, x, y) \
+    a = (q31_t) (((((q63_t) a) << 32) - ((q63_t) x * y) + 0x80000000LL ) >> 32)
+
+/* SMMULR */
+#define mult_32x32_keep32_R(a, x, y) \
+    a = (q31_t) (((q63_t) x * y + 0x80000000LL ) >> 32)
+
+/* SMMLA */
+#define multAcc_32x32_keep32(a, x, y) \
+    a += (q31_t) (((q63_t) x * y) >> 32)
+
+/* SMMLS */
+#define multSub_32x32_keep32(a, x, y) \
+    a -= (q31_t) (((q63_t) x * y) >> 32)
+
+/* SMMUL */
+#define mult_32x32_keep32(a, x, y) \
+    a = (q31_t) (((q63_t) x * y ) >> 32)
+
+#ifndef ARM_MATH_DSP
+  /**
+   * @brief definition to pack two 16 bit values.
+   */
+  #define __PKHBT(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) <<    0) & (int32_t)0x0000FFFF) | \
+                                      (((int32_t)(ARG2) << ARG3) & (int32_t)0xFFFF0000)  )
+  #define __PKHTB(ARG1, ARG2, ARG3) ( (((int32_t)(ARG1) <<    0) & (int32_t)0xFFFF0000) | \
+                                      (((int32_t)(ARG2) >> ARG3) & (int32_t)0x0000FFFF)  )
+#endif
+
+   /**
+   * @brief definition to pack four 8 bit values.
+   */
+#ifndef ARM_MATH_BIG_ENDIAN
+  #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v0) <<  0) & (int32_t)0x000000FF) | \
+                                  (((int32_t)(v1) <<  8) & (int32_t)0x0000FF00) | \
+                                  (((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | \
+                                  (((int32_t)(v3) << 24) & (int32_t)0xFF000000)  )
+#else
+  #define __PACKq7(v0,v1,v2,v3) ( (((int32_t)(v3) <<  0) & (int32_t)0x000000FF) | \
+                                  (((int32_t)(v2) <<  8) & (int32_t)0x0000FF00) | \
+                                  (((int32_t)(v1) << 16) & (int32_t)0x00FF0000) | \
+                                  (((int32_t)(v0) << 24) & (int32_t)0xFF000000)  )
+#endif
+
+
+ 
+
+/*
+ * @brief C custom defined intrinsic functions
+ */
+#if !defined (ARM_MATH_DSP)
+
+
+  /*
+   * @brief C custom defined QADD8
+   */
+  __STATIC_FORCEINLINE uint32_t __QADD8(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s, t, u;
+
+    r = __SSAT(((((q31_t)x << 24) >> 24) + (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF;
+    s = __SSAT(((((q31_t)x << 16) >> 24) + (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF;
+    t = __SSAT(((((q31_t)x <<  8) >> 24) + (((q31_t)y <<  8) >> 24)), 8) & (int32_t)0x000000FF;
+    u = __SSAT(((((q31_t)x      ) >> 24) + (((q31_t)y      ) >> 24)), 8) & (int32_t)0x000000FF;
+
+    return ((uint32_t)((u << 24) | (t << 16) | (s <<  8) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB8
+   */
+  __STATIC_FORCEINLINE uint32_t __QSUB8(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s, t, u;
+
+    r = __SSAT(((((q31_t)x << 24) >> 24) - (((q31_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF;
+    s = __SSAT(((((q31_t)x << 16) >> 24) - (((q31_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF;
+    t = __SSAT(((((q31_t)x <<  8) >> 24) - (((q31_t)y <<  8) >> 24)), 8) & (int32_t)0x000000FF;
+    u = __SSAT(((((q31_t)x      ) >> 24) - (((q31_t)y      ) >> 24)), 8) & (int32_t)0x000000FF;
+
+    return ((uint32_t)((u << 24) | (t << 16) | (s <<  8) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QADD16
+   */
+  __STATIC_FORCEINLINE uint32_t __QADD16(
+  uint32_t x,
+  uint32_t y)
+  {
+/*  q31_t r,     s;  without initialisation 'arm_offset_q15 test' fails  but 'intrinsic' tests pass! for armCC */
+    q31_t r = 0, s = 0;
+
+    r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) + (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHADD16
+   */
+  __STATIC_FORCEINLINE uint32_t __SHADD16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = (((((q31_t)x << 16) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) + (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB16
+   */
+  __STATIC_FORCEINLINE uint32_t __QSUB16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) - (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHSUB16
+   */
+  __STATIC_FORCEINLINE uint32_t __SHSUB16(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = (((((q31_t)x << 16) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) - (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QASX
+   */
+  __STATIC_FORCEINLINE uint32_t __QASX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = __SSAT(((((q31_t)x << 16) >> 16) - (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) + (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHASX
+   */
+  __STATIC_FORCEINLINE uint32_t __SHASX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = (((((q31_t)x << 16) >> 16) - (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) + (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined QSAX
+   */
+  __STATIC_FORCEINLINE uint32_t __QSAX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = __SSAT(((((q31_t)x << 16) >> 16) + (((q31_t)y      ) >> 16)), 16) & (int32_t)0x0000FFFF;
+    s = __SSAT(((((q31_t)x      ) >> 16) - (((q31_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SHSAX
+   */
+  __STATIC_FORCEINLINE uint32_t __SHSAX(
+  uint32_t x,
+  uint32_t y)
+  {
+    q31_t r, s;
+
+    r = (((((q31_t)x << 16) >> 16) + (((q31_t)y      ) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+    s = (((((q31_t)x      ) >> 16) - (((q31_t)y << 16) >> 16)) >> 1) & (int32_t)0x0000FFFF;
+
+    return ((uint32_t)((s << 16) | (r      )));
+  }
+
+
+  /*
+   * @brief C custom defined SMUSDX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSDX(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) -
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16))   ));
+  }
+
+  /*
+   * @brief C custom defined SMUADX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUADX(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16))   ));
+  }
+
+
+  /*
+   * @brief C custom defined QADD
+   */
+  __STATIC_FORCEINLINE int32_t __QADD(
+  int32_t x,
+  int32_t y)
+  {
+    return ((int32_t)(clip_q63_to_q31((q63_t)x + (q31_t)y)));
+  }
+
+
+  /*
+   * @brief C custom defined QSUB
+   */
+  __STATIC_FORCEINLINE int32_t __QSUB(
+  int32_t x,
+  int32_t y)
+  {
+    return ((int32_t)(clip_q63_to_q31((q63_t)x - (q31_t)y)));
+  }
+
+
+  /*
+   * @brief C custom defined SMLAD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLAD(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLADX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLADX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLSDX
+   */
+  __STATIC_FORCEINLINE uint32_t __SMLSDX(
+  uint32_t x,
+  uint32_t y,
+  uint32_t sum)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) -
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q31_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLALD
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALD(
+  uint32_t x,
+  uint32_t y,
+  uint64_t sum)
+  {
+/*  return (sum + ((q15_t) (x >> 16) * (q15_t) (y >> 16)) + ((q15_t) x * (q15_t) y)); */
+    return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ( ((q63_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMLALDX
+   */
+  __STATIC_FORCEINLINE uint64_t __SMLALDX(
+  uint32_t x,
+  uint32_t y,
+  uint64_t sum)
+  {
+/*  return (sum + ((q15_t) (x >> 16) * (q15_t) y)) + ((q15_t) x * (q15_t) (y >> 16)); */
+    return ((uint64_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y      ) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ( ((q63_t)sum    )                                  )   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMUAD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUAD(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) +
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16))   ));
+  }
+
+
+  /*
+   * @brief C custom defined SMUSD
+   */
+  __STATIC_FORCEINLINE uint32_t __SMUSD(
+  uint32_t x,
+  uint32_t y)
+  {
+    return ((uint32_t)(((((q31_t)x << 16) >> 16) * (((q31_t)y << 16) >> 16)) -
+                       ((((q31_t)x      ) >> 16) * (((q31_t)y      ) >> 16))   ));
+  }
+
+
+  /*
+   * @brief C custom defined SXTB16
+   */
+  __STATIC_FORCEINLINE uint32_t __SXTB16(
+  uint32_t x)
+  {
+    return ((uint32_t)(((((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF) |
+                       ((((q31_t)x <<  8) >>  8) & (q31_t)0xFFFF0000)  ));
+  }
+
+  /*
+   * @brief C custom defined SMMLA
+   */
+  __STATIC_FORCEINLINE int32_t __SMMLA(
+  int32_t x,
+  int32_t y,
+  int32_t sum)
+  {
+    return (sum + (int32_t) (((int64_t) x * y) >> 32));
+  }
+
+#endif /* !defined (ARM_MATH_DSP) */
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _TRANSFORM_FUNCTIONS_H_ */
diff --git a/ARM_math/utils.h b/ARM_math/utils.h
new file mode 100644
index 0000000..7b1da5b
--- /dev/null
+++ b/ARM_math/utils.h
@@ -0,0 +1,250 @@
+/******************************************************************************
+ * @file     arm_math_utils.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.9.0
+ * @date     20. July 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ARM_MATH_UTILS_H_
+
+#define _ARM_MATH_UTILS_H_
+
+#include "arm_math_types.h"
+#include <limits.h>
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+  /**
+   * @brief Macros required for reciprocal calculation in Normalized LMS
+   */
+
+#define INDEX_MASK         0x0000003F
+
+
+#define SQ(x) ((x) * (x))
+
+#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
+
+
+  /**
+   * @brief Function to Calculates 1/in (reciprocal) value of Q31 Data type.
+     It should not be used with negative values.
+   */
+  __STATIC_FORCEINLINE uint32_t arm_recip_q31(
+        q31_t in,
+        q31_t * dst,
+  const q31_t * pRecipTable)
+  {
+    q31_t out;
+    uint32_t tempVal;
+    uint32_t index, i;
+    uint32_t signBits;
+
+    if (in > 0)
+    {
+      signBits = ((uint32_t) (__CLZ( (uint32_t)in) - 1));
+    }
+    else
+    {
+      signBits = ((uint32_t) (__CLZ((uint32_t)(-in)) - 1));
+    }
+
+    /* Convert input sample to 1.31 format */
+    in = (in << signBits);
+
+    /* calculation of index for initial approximated Val */
+    index = (uint32_t)(in >> 24);
+    index = (index & INDEX_MASK);
+
+    /* 1.31 with exp 1 */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (uint32_t) (((q63_t) in * out) >> 31);
+      tempVal = 0x7FFFFFFFu - tempVal;
+      /*      1.31 with exp 1 */
+      /* out = (q31_t) (((q63_t) out * tempVal) >> 30); */
+      out = clip_q63_to_q31(((q63_t) out * tempVal) >> 30);
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1U);
+  }
+
+
+  /**
+   * @brief Function to Calculates 1/in (reciprocal) value of Q15 Data type.
+     It should not be used with negative values.
+   */
+  __STATIC_FORCEINLINE uint32_t arm_recip_q15(
+        q15_t in,
+        q15_t * dst,
+  const q15_t * pRecipTable)
+  {
+    q15_t out = 0;
+    int32_t tempVal = 0;
+    uint32_t index = 0, i = 0;
+    uint32_t signBits = 0;
+
+    if (in > 0)
+    {
+      signBits = ((uint32_t)(__CLZ( (uint32_t)in) - 17));
+    }
+    else
+    {
+      signBits = ((uint32_t)(__CLZ((uint32_t)(-in)) - 17));
+    }
+
+    /* Convert input sample to 1.15 format */
+    in = (q15_t)(in << signBits);
+
+    /* calculation of index for initial approximated Val */
+    index = (uint32_t)(in >>  8);
+    index = (index & INDEX_MASK);
+
+    /*      1.15 with exp 1  */
+    out = pRecipTable[index];
+
+    /* calculation of reciprocal value */
+    /* running approximation for two iterations */
+    for (i = 0U; i < 2U; i++)
+    {
+      tempVal = (((q31_t) in * out) >> 15);
+      tempVal = 0x7FFF - tempVal;
+      /*      1.15 with exp 1 */
+      out = (q15_t) (((q31_t) out * tempVal) >> 14);
+      /* out = clip_q31_to_q15(((q31_t) out * tempVal) >> 14); */
+    }
+
+    /* write output */
+    *dst = out;
+
+    /* return num of signbits of out = 1/in value */
+    return (signBits + 1);
+  }
+
+
+/**
+ * @brief  64-bit to 32-bit unsigned normalization
+ * @param[in]  in           is input unsigned long long value
+ * @param[out] normalized   is the 32-bit normalized value
+ * @param[out] norm         is norm scale
+ */
+__STATIC_INLINE  void arm_norm_64_to_32u(uint64_t in, int32_t * normalized, int32_t *norm)
+{
+    int32_t     n1;
+    int32_t     hi = (int32_t) (in >> 32);
+    int32_t     lo = (int32_t) ((in << 32) >> 32);
+
+    n1 = __CLZ((uint32_t)hi) - 32;
+    if (!n1)
+    {
+        /*
+         * input fits in 32-bit
+         */
+        n1 = __CLZ((uint32_t)lo);
+        if (!n1)
+        {
+            /*
+             * MSB set, need to scale down by 1
+             */
+            *norm = -1;
+            *normalized = (((uint32_t) lo) >> 1);
+        } else
+        {
+            if (n1 == 32)
+            {
+                /*
+                 * input is zero
+                 */
+                *norm = 0;
+                *normalized = 0;
+            } else
+            {
+                /*
+                 * 32-bit normalization
+                 */
+                *norm = n1 - 1;
+                *normalized = lo << *norm;
+            }
+        }
+    } else
+    {
+        /*
+         * input fits in 64-bit
+         */
+        n1 = 1 - n1;
+        *norm = -n1;
+        /*
+         * 64 bit normalization
+         */
+        *normalized = (int32_t)(((uint32_t)lo) >> n1) | (hi << (32 - n1));
+    }
+}
+
+__STATIC_INLINE int32_t arm_div_int64_to_int32(int64_t num, int32_t den)
+{
+    int32_t   result;
+    uint64_t   absNum;
+    int32_t   normalized;
+    int32_t   norm;
+
+    /*
+     * if sum fits in 32bits
+     * avoid costly 64-bit division
+     */
+    if (num == (int64_t)LONG_MIN)
+    {
+        absNum = LONG_MAX;
+    }
+    else
+    {
+       absNum = (uint64_t) (num > 0 ? num : -num);
+    }
+    arm_norm_64_to_32u(absNum, &normalized, &norm);
+    if (norm > 0)
+        /*
+         * 32-bit division
+         */
+        result = (int32_t) num / den;
+    else
+        /*
+         * 64-bit division
+         */
+        result = (int32_t) (num / den);
+
+    return result;
+}
+
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /*ifndef _ARM_MATH_UTILS_H_ */
diff --git a/Filter_Design/Kalman_XVA_acc_offset.m b/Filter_Design/Kalman_XVA_acc_offset.m
index 99ce527..82306ea 100644
--- a/Filter_Design/Kalman_XVA_acc_offset.m
+++ b/Filter_Design/Kalman_XVA_acc_offset.m
@@ -91,33 +91,33 @@
 
 end  
 
-%subplot( 4, 1, 1);
-figure(1);
+subplot( 3, 1, 1);
+%figure(1);
 plot(acc_est, 'r');
 hold;
 plot(acc,'g');
-plot(acc_m, '+');
+plot(acc_m, '.');
 plot(acc_est_x);
 plot(acc_offset);
 legend('Estimated Acc.','True Acc.','Acc.Measurement','Acc. Offset est.','Acc. Offset');
 title('Kalmanfilter Movement Estimation');
 ylabel('Accel. / m/s/s');
 grid;
-%subplot( 4, 1, 2);
-figure(2);
+subplot( 3, 1, 2);
+%figure(2);
 plot(vel_est,'r');
 hold;
 plot(vel,'g');
-plot(vel_m, '+');
+plot(vel_m, '.');
 legend('Estimated Velocity','True Velocity','Velocity Measurement');
 ylabel('Velocity / m/s');
 grid;
-%subplot( 4, 1, 3)
-figure(3);
+subplot( 3, 1, 3)
+%figure(3);
 plot(pos_est, 'r');
 hold;
 plot(pos,'g');
-plot(pos_m, '+');
+plot(pos_m, '.');
 ylabel('Position / m');
 legend('Estimated Position','True Position','Position Measurement');
 xlabel('Time / 10ms');
diff --git a/Filter_Design/induction_sensor_matrix_plus_soft_iron_compensation.m b/Filter_Design/induction_sensor_matrix_plus_soft_iron_compensation.m
new file mode 100644
index 0000000..45ca54d
--- /dev/null
+++ b/Filter_Design/induction_sensor_matrix_plus_soft_iron_compensation.m
@@ -0,0 +1,144 @@
+if( 1)
+    start = 1
+    stop=length(x)
+else
+    start = 105 * 6000; % choose ideal flight section
+    stop  = 300 * 6000; 
+end
+
+new_sample_rate = 10; % Hz
+old_sample_rate = 100;
+
+if 1
+ a_start = round( start * new_sample_rate / old_sample_rate);
+ a_stop  = round( stop * new_sample_rate / old_sample_rate);
+else
+ a_start=1000;
+ a_stop=length ( mag_err_predicted_x) -1000;
+end
+
+off_x = 0; % artificial offset injection
+off_y = 0;
+off_z = 0;
+
+mag_x = resample( x(114,start:stop),new_sample_rate, old_sample_rate )- off_x; 
+mag_y = resample( x(115,start:stop),new_sample_rate, old_sample_rate )- off_y; 
+mag_z = resample( x(116,start:stop),new_sample_rate, old_sample_rate )- off_z; 
+
+err_x = resample( x(117,start:stop), new_sample_rate, old_sample_rate)- off_x;
+err_y = resample( x(118,start:stop), new_sample_rate, old_sample_rate)- off_y;
+err_z = resample( x(119,start:stop), new_sample_rate, old_sample_rate)- off_z;
+
+ideal_mag_x = mag_x - err_x;
+ideal_mag_y = mag_y - err_y;
+ideal_mag_z = mag_z - err_z;
+
+q1 = resample( x(53,start:stop),new_sample_rate, old_sample_rate );
+q2 = resample( x(54,start:stop),new_sample_rate, old_sample_rate );
+q3 = resample( x(55,start:stop),new_sample_rate, old_sample_rate );
+q4 = resample( x(56,start:stop),new_sample_rate, old_sample_rate );
+
+% calculate sensor mapping matrix
+
+input = [ mag_x' mag_y' mag_z'];
+
+    modelterms = ...
+[   
+     0     0     0; ...
+     1     0     0; ...
+     0     1     0; ...
+     0     0     1];
+
+sensor_matrix_x = polyfitn( input, ideal_mag_x, modelterms)
+induction_calibrated_x = polyvaln( sensor_matrix_x, input)';
+
+sensor_matrix_y = polyfitn( input, ideal_mag_y, modelterms)
+induction_calibrated_y = polyvaln( sensor_matrix_y, input)';
+
+sensor_matrix_z = polyfitn( input, ideal_mag_z, modelterms)
+induction_calibrated_z = polyvaln( sensor_matrix_z, input)';
+
+
+% calculate soft iron correction terms
+
+input = [ q1' q2' q3' q4'];
+
+    modelterms = ...
+[   
+     0     0     0     0; ...
+     2     0     0     0; ...
+     1     1     0     0; ...
+     1     0     1     0; ...
+     1     0     0     1; ...
+     0     2     0     0; ...
+     0     1     1     0; ...
+     0     1     0     1; ...
+     0     0     2     0; ...
+     0     0     1     1; ...
+     0     0     0     2];
+
+compensation_poly_x = polyfitn( input, induction_calibrated_x - ideal_mag_x, modelterms)
+compensation_poly_y = polyfitn( input, induction_calibrated_y - ideal_mag_y, modelterms)
+compensation_poly_z = polyfitn( input, induction_calibrated_z - ideal_mag_z, modelterms)
+
+soft_iron_err_predicted_x = polyvaln( compensation_poly_x, input)';
+soft_iron_err_predicted_y = polyvaln( compensation_poly_y, input)';
+soft_iron_err_predicted_z = polyvaln( compensation_poly_z, input)';
+
+induction_corrected_x = induction_calibrated_x - soft_iron_err_predicted_x;
+induction_corrected_y = induction_calibrated_y - soft_iron_err_predicted_y;
+induction_corrected_z = induction_calibrated_z - soft_iron_err_predicted_z;
+
+residual_error_x = induction_corrected_x - ideal_mag_x;
+residual_error_y = induction_corrected_y - ideal_mag_y;
+residual_error_z = induction_corrected_z - ideal_mag_z;
+
+std_error_x = std( err_x)
+std_error_x_corrected = std( residual_error_x)
+
+std_error_y = std( err_y)
+std_error_y_corrected = std( residual_error_y)
+
+std_error_z = std( err_z)
+std_error_z_corrected = std( residual_error_z)
+
+orientation_angle_error = sqrt( std_error_x * std_error_x + std_error_y * std_error_y +std_error_z * std_error_z)*180/pi
+
+time = linspace( start / 6000, stop / 6000, length( err_x));
+
+a=subplot( 3,1,1);
+plot( time, err_x,'Color','red','LineWidth',2.0);
+grid
+hold
+plot( time, induction_calibrated_x - ideal_mag_x,'Color','blue','LineWidth',2.0);
+plot( time, induction_corrected_x - ideal_mag_x,'Color','black','LineWidth',2.0);
+legend('Observed','Calibrated','Soft-Iron-Corrected')
+title('Magnetic Field Error Front')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+%axis([0 max(time) -0.1 0.1]);
+
+b=subplot( 3,1,2);
+plot( time, err_y,'Color','red','LineWidth',2.0);
+grid
+hold
+plot( time, induction_calibrated_y - ideal_mag_y,'Color','blue','LineWidth',2.0);
+plot( time, induction_corrected_y - ideal_mag_y,'Color','black','LineWidth',2.0);
+legend('Observed','Calibrated','Soft-Iron-Corrected')
+title('Magnetic Field Error Right')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+%axis([0 max(time) -0.1 0.1]);
+
+c=subplot( 3,1,3);
+plot( time, err_z,'Color','red','LineWidth',2.0);
+grid
+hold
+plot( time, induction_calibrated_z - ideal_mag_z,'Color','blue','LineWidth',2.0);
+plot( time, induction_corrected_z - ideal_mag_z,'Color','black','LineWidth',2.0);
+legend('Observed','Calibrated','Soft-Iron-Corrected')
+title('Magnetic Field Error Down')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+
+linkaxes([a b c],'x');
diff --git a/Filter_Design/magnetic_calibration_SVD.m b/Filter_Design/magnetic_calibration_SVD.m
new file mode 100644
index 0000000..5d66f08
--- /dev/null
+++ b/Filter_Design/magnetic_calibration_SVD.m
@@ -0,0 +1,125 @@
+start =5*6000; 
+stop = length(x);
+
+format long;
+
+new_sample_rate = 10; % Hz
+old_sample_rate = 100;
+
+mag_x = resample( x(114,start:stop),new_sample_rate, old_sample_rate ); 
+mag_y = resample( x(115,start:stop),new_sample_rate, old_sample_rate ); 
+mag_z = resample( x(116,start:stop),new_sample_rate, old_sample_rate ); 
+
+err_x = resample( x(117,start:stop), new_sample_rate, old_sample_rate);
+err_y = resample( x(118,start:stop), new_sample_rate, old_sample_rate);
+err_z = resample( x(119,start:stop), new_sample_rate, old_sample_rate);
+
+q1 = resample( x(53,start:stop),new_sample_rate, old_sample_rate );
+q2 = resample( x(54,start:stop),new_sample_rate, old_sample_rate );
+q3 = resample( x(55,start:stop),new_sample_rate, old_sample_rate );
+q4 = resample( x(56,start:stop),new_sample_rate, old_sample_rate );
+
+input = [mag_x' mag_y' mag_z' q1' q2' q3' q4'];
+
+modelterms = ...
+[   
+     0     0     0     0     0     0     0;
+     1     0     0     0     0     0     0; ...
+     0     0     0     1     1     0     0; ...
+     0     0     0     1     0     1     0; ...
+     0     0     0     1     0     0     1; ...
+     0     0     0     0     2     0     0; ...
+     0     0     0     0     1     1     0; ...
+     0     0     0     0     1     0     1; ...
+     0     0     0     0     0     2     0; ...
+     0     0     0     0     0     1     1; ...
+     0     0     0     0     0     0     2];
+
+poly_x = polyfitn( input, err_x, modelterms)
+modelterms = ...
+[   
+     0     0     0     0     0     0     0;
+     0     1     0     0     0     0     0; ...
+     0     0     0     1     1     0     0; ...
+     0     0     0     1     0     1     0; ...
+     0     0     0     1     0     0     1; ...
+     0     0     0     0     2     0     0; ...
+     0     0     0     0     1     1     0; ...
+     0     0     0     0     1     0     1; ...
+     0     0     0     0     0     2     0; ...
+     0     0     0     0     0     1     1; ...
+     0     0     0     0     0     0     2];
+poly_y = polyfitn( input, err_y, modelterms)
+modelterms = ...
+[   
+     0     0     0     0     0     0     0;
+     0     0     1     0     0     0     0; ...
+     0     0     0     1     1     0     0; ...
+     0     0     0     1     0     1     0; ...
+     0     0     0     1     0     0     1; ...
+     0     0     0     0     2     0     0; ...
+     0     0     0     0     1     1     0; ...
+     0     0     0     0     1     0     1; ...
+     0     0     0     0     0     2     0; ...
+     0     0     0     0     0     1     1; ...
+     0     0     0     0     0     0     2];
+poly_z = polyfitn( input, err_z, modelterms)
+
+mag_err_predicted_x = polyvaln( poly_x, input)';
+mag_err_predicted_y = polyvaln( poly_y, input)';
+mag_err_predicted_z = polyvaln( poly_z, input)';
+
+std_error_x = std( err_x)
+std_error_x_corrected = std( err_x - mag_err_predicted_x)
+
+std_error_y = std( err_y)
+std_error_y_corrected = std( err_y - mag_err_predicted_y)
+
+std_error_z = std( err_z)
+std_error_z_corrected = std( err_z - mag_err_predicted_z)
+
+std_error_mag_induction_abs = std( sqrt( ...
+    (err_x - mag_err_predicted_x).*(err_x - mag_err_predicted_x)+ ...
+    (err_y - mag_err_predicted_y).*(err_y - mag_err_predicted_y)+ ...
+    (err_z - mag_err_predicted_z).*(err_z - mag_err_predicted_z)))
+
+orintation_angle_error = sqrt( std_error_x * std_error_x + std_error_y * std_error_y +std_error_z * std_error_z)*180/pi
+
+time = linspace( start / 6000, stop/6000, length( err_x));
+
+a=subplot( 3,1,1);
+plot( time, err_x,'Color','red');
+grid
+hold
+plot( time, err_x - mag_err_predicted_x,'Color','black');
+legend('Observed','Corrected')
+title('Magnetic Field Error Front')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+axis([0 max(time) -0.1 0.1]);
+
+b=subplot( 3,1,2);
+plot( time, err_y,'Color','red');
+grid
+hold
+plot( time, err_y - mag_err_predicted_y,'Color','black');
+legend('Observed','Corrected')
+title('Magnetic Field Error Right')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+axis([0 max(time) -0.1 0.1]);
+
+c=subplot( 3,1,3);
+plot( time, err_z,'Color','red');
+grid
+hold
+plot( time, err_z - mag_err_predicted_z,'Color','black');
+%plot( time, err_z - testz','Color','blue');
+legend('Observed','Corrected')
+title('Magnetic Field Error Down')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+axis([0 max(time) -0.1 0.1]);
+
+linkaxes([a b c],'x');
+
diff --git a/Filter_Design/magnetic_compensation_SVD_work.m b/Filter_Design/magnetic_compensation_SVD_work.m
new file mode 100644
index 0000000..019bb49
--- /dev/null
+++ b/Filter_Design/magnetic_compensation_SVD_work.m
@@ -0,0 +1,99 @@
+start = 80*6000; 
+stop = 368*6000; 
+%stop = length(x);
+
+format long;
+
+new_sample_rate = 10; % Hz
+old_sample_rate = 100;
+
+mag_x = resample( x(114,start:stop),new_sample_rate, old_sample_rate ); 
+mag_y = resample( x(115,start:stop),new_sample_rate, old_sample_rate ); 
+mag_z = resample( x(116,start:stop),new_sample_rate, old_sample_rate ); 
+
+err_x = resample( x(117,start:stop), new_sample_rate, old_sample_rate);
+err_y = resample( x(118,start:stop), new_sample_rate, old_sample_rate);
+err_z = resample( x(119,start:stop), new_sample_rate, old_sample_rate);
+
+q1 = resample( x(53,start:stop),new_sample_rate, old_sample_rate );
+q2 = resample( x(54,start:stop),new_sample_rate, old_sample_rate );
+q3 = resample( x(55,start:stop),new_sample_rate, old_sample_rate );
+q4 = resample( x(56,start:stop),new_sample_rate, old_sample_rate );
+
+input = [q1' q2' q3' q4'];
+
+modelterms = ...
+[   
+     0     0     0     0;
+     1     1     0     0; ...
+     1     0     1     0; ...
+     1     0     0     1; ...
+     0     2     0     0; ...
+     0     1     1     0; ...
+     0     1     0     1; ...
+     0     0     2     0; ...
+     0     0     1     1; ...
+     0     0     0     2];
+
+poly_x = polyfitn( input, err_x, modelterms)
+poly_y = polyfitn( input, err_y,  modelterms)
+poly_z = polyfitn( input, err_z,  modelterms)
+
+mag_err_predicted_x = polyvaln( poly_x, input)';
+mag_err_predicted_y = polyvaln( poly_y, input)';
+mag_err_predicted_z = polyvaln( poly_z, input)';
+
+std_error_x = std( err_x)
+std_error_x_corrected = std( err_x - mag_err_predicted_x)
+
+std_error_y = std( err_y)
+std_error_y_corrected = std( err_y - mag_err_predicted_y)
+
+std_error_z = std( err_z)
+std_error_z_corrected = std( err_z - mag_err_predicted_z)
+
+std_error_mag_induction_abs = std( sqrt( ...
+    (err_x - mag_err_predicted_x).*(err_x - mag_err_predicted_x)+ ...
+    (err_y - mag_err_predicted_y).*(err_y - mag_err_predicted_y)+ ...
+    (err_z - mag_err_predicted_z).*(err_z - mag_err_predicted_z)))
+
+orintation_angle_error = sqrt( std_error_x * std_error_x + std_error_y * std_error_y +std_error_z * std_error_z)*180/pi
+
+time = linspace( start / 6000, stop/6000, length( err_x));
+
+a=subplot( 3,1,1);
+plot( time, err_x,'Color','red');
+grid
+hold
+plot( time, err_x - mag_err_predicted_x,'Color','black');
+legend('Observed','Corrected')
+title('Magnetic Field Error Front')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+axis([0 max(time) -0.1 0.1]);
+
+b=subplot( 3,1,2);
+plot( time, err_y,'Color','red');
+grid
+hold
+plot( time, err_y - mag_err_predicted_y,'Color','black');
+legend('Observed','Corrected')
+title('Magnetic Field Error Right')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+axis([0 max(time) -0.1 0.1]);
+
+c=subplot( 3,1,3);
+plot( time, err_z,'Color','red');
+grid
+hold
+plot( time, err_z - mag_err_predicted_z,'Color','black');
+%plot( time, err_z - testz','Color','blue');
+legend('Observed','Corrected')
+title('Magnetic Field Error Down')
+xlabel('Time / min.');
+ylabel('Magn. Ind. Normalized');
+axis([0 max(time) -0.1 0.1]);
+
+linkaxes([a b c],'x');
+
diff --git a/Generic_Algorithms/accumulating_averager.h b/Generic_Algorithms/accumulating_averager.h
index b0e964e..5630c55 100644
--- a/Generic_Algorithms/accumulating_averager.h
+++ b/Generic_Algorithms/accumulating_averager.h
@@ -1,6 +1,7 @@
 #ifndef GENERIC_ALGORITHMS_ACCUMULATING_AVERAGER_H_
 #define GENERIC_ALGORITHMS_ACCUMULATING_AVERAGER_H_
 
+//! simple average mechanism using the sum of input values
 template <typename type> class accumulating_averager
 {
 public:
diff --git a/Generic_Algorithms/boxcar_averager.h b/Generic_Algorithms/boxcar_averager.h
index 8964283..00da3e7 100644
--- a/Generic_Algorithms/boxcar_averager.h
+++ b/Generic_Algorithms/boxcar_averager.h
@@ -8,6 +8,7 @@
 #ifndef GENERIC_ALGORITHMS_BOXCAR_AVERAGER_H_
 #define GENERIC_ALGORITHMS_BOXCAR_AVERAGER_H_
 
+//! FIR averager using a rectangle window
 template <class data_t, unsigned length> class boxcar_averager
 {
 public:
diff --git a/Generic_Algorithms/differentiator.h b/Generic_Algorithms/differentiator.h
index f6384eb..09a6962 100644
--- a/Generic_Algorithms/differentiator.h
+++ b/Generic_Algorithms/differentiator.h
@@ -62,7 +62,7 @@ template <class basetype, class datatype>
 
 private:
 //! sampling time
-   const basetype time_constant;
+   basetype time_constant;
 //! maintains old output
    datatype old_value;
    datatype differentiation;
diff --git a/Generic_Algorithms/euler.h b/Generic_Algorithms/euler.h
index d0a297e..3ccd0ca 100644
--- a/Generic_Algorithms/euler.h
+++ b/Generic_Algorithms/euler.h
@@ -11,10 +11,10 @@
 template <class datatype> class eulerangle
 {
 public:
-	eulerangle( datatype ir=0, datatype in=0, datatype iy=0)
-	: r(ir), p(in), y(iy)
+	eulerangle( datatype ir=0, datatype ip=0, datatype iy=0)
+	: roll(ir), pitch(ip), yaw(iy)
 	{}
-	datatype r,p,y;
+	datatype roll,pitch,yaw;
 };
 
 #endif
diff --git a/Generic_Algorithms/mean_and_variance_finder.h b/Generic_Algorithms/mean_and_variance_finder.h
index e4e5ac0..d87c609 100644
--- a/Generic_Algorithms/mean_and_variance_finder.h
+++ b/Generic_Algorithms/mean_and_variance_finder.h
@@ -3,6 +3,7 @@
 #ifndef GENERIC_ALGORITHMS_MEAN_AND_VARIANCE_FINDER_H_
 #define GENERIC_ALGORITHMS_MEAN_AND_VARIANCE_FINDER_H_
 
+//! generic statistics class for average and variance data
 template <class sample_data>class mean_and_variance_finder_t
 {
 public:
diff --git a/Generic_Algorithms/quaternion.h b/Generic_Algorithms/quaternion.h
index 882a594..305ba81 100644
--- a/Generic_Algorithms/quaternion.h
+++ b/Generic_Algorithms/quaternion.h
@@ -35,7 +35,7 @@
 template <class datatype > class quaternion: public vector <datatype, 4>
 {
 public:
-	//! constructor from eulerwinkel
+	//! constructor from euler angle
 	quaternion( vector <datatype, 3> &init)
 	{
 		from_euler( init[0], init[1], init[2]);
@@ -51,78 +51,72 @@ template <class datatype > class quaternion: public vector <datatype, 4>
 	//! normalize quaternion absolute value to ONE
 	inline void normalize(void)
 	{
-	  datatype tmp = vector<datatype, 4>::e[0] * vector<datatype, 4>::e[0]
-		       + vector<datatype, 4>::e[1] * vector<datatype, 4>::e[1]
-		       + vector<datatype, 4>::e[2] * vector<datatype, 4>::e[2]
-		       + vector<datatype, 4>::e[3] * vector<datatype, 4>::e[3] ;
+	  datatype tmp = this->e[0] * this->e[0]
+		       + this->e[1] * this->e[1]
+		       + this->e[2] * this->e[2]
+		       + this->e[3] * this->e[3] ;
 	  tmp = ONE / tmp;
 	  tmp = SQRT( tmp);
 	  for( int i = 0; i < 4; ++i)
-		  vector<datatype, 4>::e[i] *= tmp;
+		  this->e[i] *= tmp;
 	};
 
 	//! quaternion -> euler angle transformation
 	operator eulerangle <datatype> () const
 	{
-		datatype e0 = vector<datatype, 4>::e[0];
-		datatype e1 = vector<datatype, 4>::e[1];
-		datatype e2 = vector<datatype, 4>::e[2];
-		datatype e3 = vector<datatype, 4>::e[3];
+		datatype e0 = this->e[0];
+		datatype e1 = this->e[1];
+		datatype e2 = this->e[2];
+		datatype e3 = this->e[3];
 
 		eulerangle <datatype> _euler;
 
 		//! formula from roenbaeck p34
-		_euler.r = ATAN2(  TWO * (e0*e1 + e2*e3) , e0*e0 - e1*e1 - e2*e2 + e3*e3 );
-		_euler.p = ASIN(   TWO * (e0*e2 - e3*e1));
-		_euler.y = ATAN2(  TWO * (e0*e3 + e1*e2) , e0*e0 + e1*e1 - e2*e2 - e3*e3 );
+		_euler.roll  = ATAN2(  TWO * (e0*e1 + e2*e3) , e0*e0 - e1*e1 - e2*e2 + e3*e3 );
+		_euler.pitch = ASIN(   TWO * (e0*e2 - e3*e1));
+		_euler.yaw   = ATAN2(  TWO * (e0*e3 + e1*e2) , e0*e0 + e1*e1 - e2*e2 - e3*e3 );
 		return _euler;
 	}
-	//! linearized euler component e0 (approximate nick angle / rad)
-	inline datatype lin_e0( void) const
-	{
-		datatype e0 = vector<datatype, 4>::e[0];
-		datatype e1 = vector<datatype, 4>::e[1];
-		datatype e2 = vector<datatype, 4>::e[2];
-		datatype e3 = vector<datatype, 4>::e[3];
-		return TWO * (e0*e1 + e2*e3) / ( e0*e0 - e1*e1 - e2*e2 + e3*e3);
 
-	}
-	//! linearized euler component e1 (approximate roll angle / rad)
-	inline datatype lin_e1( void) const
+	//! quaternion chaining, multiplication
+	quaternion<datatype> operator *( const quaternion<datatype> right) const
 	{
-		datatype e0 = vector<datatype, 4>::e[0];
-		datatype e1 = vector<datatype, 4>::e[1];
-		datatype e2 = vector<datatype, 4>::e[2];
-		datatype e3 = vector<datatype, 4>::e[3];
-		return TWO * (e0*e2 - e3*e1);
-	}
-	//! euler component e2
-	inline datatype get_e2( void) const
-	{
-		datatype e0 = vector<datatype, 4>::e[0];
-		datatype e1 = vector<datatype, 4>::e[1];
-		datatype e2 = vector<datatype, 4>::e[2];
-		datatype e3 = vector<datatype, 4>::e[3];
-		return ATAN2(  TWO * (e0*e3 + e1*e2) , e0*e0 + e1*e1 - e2*e2 - e3*e3 );
+	  quaternion<datatype> result;
+	  datatype w1 = vector<datatype, 4>::e[0];
+	  datatype x1 = vector<datatype, 4>::e[1];
+	  datatype y1 = vector<datatype, 4>::e[2];
+	  datatype z1 = vector<datatype, 4>::e[3];
+
+	  datatype w2 = right[0];
+	  datatype x2 = right[1];
+	  datatype y2 = right[2];
+	  datatype z2 = right[3];
+
+	  result[0] = w1*w2 - x1*x2 - y1*y2 - z1*z2;
+	  result[1] = w1*x2 + x1*w2 + y1*z2 - z1*y2;
+	  result[2] = w1*y2 - x1*z2 + y1*w2 + z1*x2;
+	  result[3] = w1*z2 + x1*y2 - y1*x2 + z1*w2;
+
+	  return result;
 	}
 
 	//! get north component of attitude
 	inline datatype get_north( void) const
 	{
-		datatype e0 = vector<datatype, 4>::e[0];
-		datatype e1 = vector<datatype, 4>::e[1];
-		datatype e2 = vector<datatype, 4>::e[2];
-		datatype e3 = vector<datatype, 4>::e[3];
+		datatype e0 = this->e[0];
+		datatype e1 = this->e[1];
+		datatype e2 = this->e[2];
+		datatype e3 = this->e[3];
 		return e0*e0 + e1*e1 - e2*e2 - e3*e3;
 	}
 
 	//! get east component of attitude
 	inline datatype get_east( void) const
 	{
-		datatype e0 = vector<datatype, 4>::e[0];
-		datatype e1 = vector<datatype, 4>::e[1];
-		datatype e2 = vector<datatype, 4>::e[2];
-		datatype e3 = vector<datatype, 4>::e[3];
+		datatype e0 = this->e[0];
+		datatype e1 = this->e[1];
+		datatype e2 = this->e[2];
+		datatype e3 = this->e[3];
 		return TWO * (e0*e3 + e1*e2);
 	}
 
@@ -136,19 +130,30 @@ template <class datatype > class quaternion: public vector <datatype, 4>
 		return TWO * (e1*e3-e0*e2);
 	}
 
-	//! quaternion update using rotation vector
-	void rotate( datatype p, datatype q, datatype r)
+	//! get heading component of attitude
+	datatype get_heading( void) const
 	{
 		datatype e0 = vector<datatype, 4>::e[0];
 		datatype e1 = vector<datatype, 4>::e[1];
 		datatype e2 = vector<datatype, 4>::e[2];
 		datatype e3 = vector<datatype, 4>::e[3];
 
+		return ATAN2(  TWO * (e0*e3 + e1*e2) , e0*e0 + e1*e1 - e2*e2 - e3*e3 );
+	}
+
+	//! quaternion update using rotation vector
+	void rotate( datatype p, datatype q, datatype r)
+	{
+		datatype e0 = this->e[0];
+		datatype e1 = this->e[1];
+		datatype e2 = this->e[2];
+		datatype e3 = this->e[3];
+
 		//! R.Rogers formula 2.92
-		vector<datatype, 4>::e[0] += (-e1*p -e2*q -e3*r);
-		vector<datatype, 4>::e[1] += ( e0*p +e2*r -e3*q);
-		vector<datatype, 4>::e[2] += ( e0*q -e1*r +e3*p);
-		vector<datatype, 4>::e[3] += ( e0*r +e1*q -e2*p);
+		this->e[0] += (-e1*p -e2*q -e3*r);
+		this->e[1] += ( e0*p +e2*r -e3*q);
+		this->e[2] += ( e0*q -e1*r +e3*p);
+		this->e[3] += ( e0*r +e1*q -e2*p);
 
 		normalize();
 	}
@@ -156,6 +161,7 @@ template <class datatype > class quaternion: public vector <datatype, 4>
 	//! euler angle -> quaternion transformation
 	void from_euler( datatype p, datatype q, datatype r)
 	{
+	// 3-2-1: yaw, then pitch, then roll
 		p *= HALF; q *= HALF; r *= HALF;
 		datatype sinphi   = SIN( p);
 		datatype cosphi   = COS( p);
@@ -164,20 +170,20 @@ template <class datatype > class quaternion: public vector <datatype, 4>
 		datatype sinpsi   = SIN( r);
 		datatype cospsi   = COS( r);
 
-		vector<datatype, 4>::e[0] = cosphi*costheta*cospsi + sinphi*sintheta*sinpsi;
-		vector<datatype, 4>::e[1] = sinphi*costheta*cospsi + cosphi*sintheta*sinpsi;
-		vector<datatype, 4>::e[2] = cosphi*sintheta*cospsi + sinphi*costheta*sinpsi;
-		vector<datatype, 4>::e[3] = cosphi*costheta*sinpsi - sinphi*sintheta*cospsi;
+		this->e[0] = cosphi*costheta*cospsi + sinphi*sintheta*sinpsi;
+		this->e[1] = sinphi*costheta*cospsi - cosphi*sintheta*sinpsi;
+		this->e[2] = cosphi*sintheta*cospsi + sinphi*costheta*sinpsi;
+		this->e[3] = cosphi*costheta*sinpsi - sinphi*sintheta*cospsi;
 	}
 
 	//! quaternion -> rotation matrix transformation
 	void get_rotation_matrix (matrix<datatype, 3> &m) const
 	{
 		//! R.Rogers formula 2.90
-		datatype e0=vector<datatype, 4>::e[0];
-		datatype e1=vector<datatype, 4>::e[1];
-		datatype e2=vector<datatype, 4>::e[2];
-		datatype e3=vector<datatype, 4>::e[3];
+		datatype e0=this->e[0];
+		datatype e1=this->e[1];
+		datatype e2=this->e[2];
+		datatype e3=this->e[3];
 
 	    m.e[0][0] = TWO * (e0*e0+e1*e1) - ONE;
 	    m.e[0][1] = TWO * (e1*e2-e0*e3);
@@ -191,21 +197,21 @@ template <class datatype > class quaternion: public vector <datatype, 4>
 	    m.e[2][1] = TWO * (e2*e3+e0*e1);
 	    m.e[2][2] = TWO * (e0*e0+e3*e3) - ONE;
 	}
-	quaternion <datatype> operator * ( quaternion <datatype> & right) //!< quaternion multiplication
+	quaternion <datatype> operator * ( const quaternion <datatype> & right) const //!< quaternion multiplication
 		{
 		quaternion <datatype> result;
-		datatype e0=vector<datatype, 4>::e[0];
-		datatype e1=vector<datatype, 4>::e[1];
-		datatype e2=vector<datatype, 4>::e[2];
-		datatype e3=vector<datatype, 4>::e[3];
-		datatype re0=right.vector<datatype, 4>::e[0];
-		datatype re1=right.vector<datatype, 4>::e[1];
-		datatype re2=right.vector<datatype, 4>::e[2];
-		datatype re3=right.vector<datatype, 4>::e[3];
-		result.vector<datatype, 4>::e[0] = e0 * re0 - e1 * re1 - e2 * re2 + e3 * re3;
-		result.vector<datatype, 4>::e[1] = e1 * re0 + e2 * re3 - e3 * re2 + e0 * re1;
-		result.vector<datatype, 4>::e[2] = e3 * re1 + e0 * re2 - e1 * re3 + e2 * re0;
-		result.vector<datatype, 4>::e[3] = e1 * re2 - e2 * re1 + e0 * re3 + e3 * re0;
+		datatype e0=this->e[0];
+		datatype e1=this->e[1];
+		datatype e2=this->e[2];
+		datatype e3=this->e[3];
+		datatype re0=right.e[0];
+		datatype re1=right.e[1];
+		datatype re2=right.e[2];
+		datatype re3=right.e[3];
+		result.e[0] = e0 * re0 - e1 * re1 - e2 * re2 + e3 * re3;
+		result.e[1] = e1 * re0 + e2 * re3 - e3 * re2 + e0 * re1;
+		result.e[2] = e3 * re1 + e0 * re2 - e1 * re3 + e2 * re0;
+		result.e[3] = e1 * re2 - e2 * re1 + e0 * re3 + e3 * re0;
 		return result;
 		}
 
@@ -216,13 +222,12 @@ template <class datatype > class quaternion: public vector <datatype, 4>
 		//! formula from roenbaeck p35
 		tmp = SQRT( tmp);
 		tmp *= HALF;
-		vector<datatype, 4>::e[0] = tmp;
+		this->e[0] = tmp;
 		tmp = QUARTER / tmp;
-		vector<datatype, 4>::e[1] = tmp * (rotm.e[2][1] - rotm.e[1][2]);
-		vector<datatype, 4>::e[2] = tmp * (rotm.e[0][2] - rotm.e[2][0]);
-		vector<datatype, 4>::e[3] = tmp * (rotm.e[1][0] - rotm.e[0][1]);
+		this->e[1] = tmp * (rotm.e[2][1] - rotm.e[1][2]);
+		this->e[2] = tmp * (rotm.e[0][2] - rotm.e[2][0]);
+		this->e[3] = tmp * (rotm.e[1][0] - rotm.e[0][1]);
 		normalize(); // compensate computational inaccuracies
 		};
 };
-
 #endif // QUATERNION_H
diff --git a/Generic_Algorithms/trigger.h b/Generic_Algorithms/trigger.h
index e68bf49..d509b5b 100644
--- a/Generic_Algorithms/trigger.h
+++ b/Generic_Algorithms/trigger.h
@@ -25,7 +25,7 @@
 #ifndef TRIGGER_H_
 #define TRIGGER_H_
 
-//! class for hysteresis-based triggering on input data
+//! Hysteresis-based triggering on input data
 class trigger
 {
 public:
diff --git a/NAV_Algorithms/AHRS.cpp b/NAV_Algorithms/AHRS.cpp
index 26f3490..3a99a57 100644
--- a/NAV_Algorithms/AHRS.cpp
+++ b/NAV_Algorithms/AHRS.cpp
@@ -28,6 +28,10 @@
 #include "magnetic_induction_report.h"
 #include "embedded_memory.h"
 #include "NAV_tuning_parameters.h"
+#include "soft_iron_compensator.h"
+#include "compass_calibrator_3D.h"
+
+#define TEST_CALIBRATION_AND_COMPENSATION 0
 
 #if USE_HARDWARE_EEPROM	== 0
 #include "EEPROM_emulation.h"
@@ -43,8 +47,8 @@ AHRS_type::attitude_setup (const float3vector &acceleration,
   float3vector north, east, down;
 
   float3vector induction;
-  if( compass_calibration.isCalibrationDone()) // use calibration if available
-    induction = compass_calibration.calibrate(mag);
+  if( compass_calibration.available())
+    induction = compass_calibration.calibrate( mag);
   else
     induction = mag;
 
@@ -104,21 +108,28 @@ AHRS_type::update_circling_state ()
 #endif
 }
 
-void AHRS_type::feed_magnetic_induction_observer(const float3vector &mag_sensor)
+void AHRS_type::feed_magnetic_induction_observer( const float3vector &mag_sensor, const float3vector &mag_delta)
 {
-  float3vector expected_body_induction = body2nav.reverse_map(expected_nav_induction);
+  float error_margin = nav_correction.abs();
+  if(  error_margin > NAV_CORRECTION_LIMIT)
+    return;
+
   bool turning_right = turn_rate_averager.get_output() > 0.0f;
 
+#if USE_SOFT_IRON_COMPENSATION
+  if( mag_delta.abs() < 0.05) // only if the precision is already reasonably good ...
+    {
+      bool calibration_data_complete = soft_iron_compensator.learn( mag_delta, attitude, turning_right, error_margin);
+      if( calibration_data_complete)
+	trigger_soft_iron_compensator_calculation();
+    }
+#endif
+
   for (unsigned i = 0; i < 3; ++i)
     if( turning_right)
       mag_calibration_data_collector_right_turn[i].add_value ( MAG_SCALE * expected_body_induction[i], MAG_SCALE * mag_sensor[i]);
     else
       mag_calibration_data_collector_left_turn[i].add_value ( MAG_SCALE * expected_body_induction[i], MAG_SCALE * mag_sensor[i]);
-
-#if USE_EARTH_INDUCTION_DATA_COLLECTOR
-  // measurement of earth induction to find the local earth field parameters
-  earth_induction_data_collector.feed( induction_nav_frame, turning_right);
-#endif
 }
 
 AHRS_type::AHRS_type (float sampling_time)
@@ -141,17 +152,13 @@ AHRS_type::AHRS_type (float sampling_time)
   turn_rate_averager( ANGLE_F_BY_FS),
   G_load_averager(     G_LOAD_F_BY_FS),
   compass_calibration(),
-#if USE_EARTH_INDUCTION_DATA_COLLECTOR
-  earth_induction_data_collector( MAG_SCALE),
-#endif
   antenna_DOWN_correction(  configuration( ANT_SLAVE_DOWN)  / configuration( ANT_BASELENGTH)),
   antenna_RIGHT_correction( configuration( ANT_SLAVE_RIGHT) / configuration( ANT_BASELENGTH)),
   heading_difference_AHRS_DGNSS(0.0f),
   cross_acc_correction(0.0f),
   magnetic_disturbance(0.0f),
   magnetic_control_gain(1.0f),
-  automatic_magnetic_calibration(configuration(MAG_AUTO_CALIB)),
-  automatic_earth_field_parameters( false),
+  automatic_magnetic_calibration( (magnetic_calibration_type)(round)(configuration(MAG_AUTO_CALIB))),
   magnetic_calibration_updated( false)
 {
   update_magnetic_loop_gain(); // adapt to magnetic inclination
@@ -221,22 +228,41 @@ AHRS_type::update_diff_GNSS (const float3vector &gyro,
 			     const float3vector &GNSS_acceleration,
 			     float GNSS_heading)
 {
-  circle_state_t old_circle_state = circling_state;
-  update_circling_state ();
+  expected_body_induction = body2nav.reverse_map( expected_nav_induction);
 
-  if( compass_calibration.isCalibrationDone()) // use calibration if available
-      body_induction = compass_calibration.calibrate(mag_sensor);
-  else
-      body_induction = mag_sensor;
+#if TEST_CALIBRATION_AND_COMPENSATION
+  float fcoordinates[] =
+      {
+	  	-0.05, 0.95, 0.03,
+		1, 0.1, -0.1,
+		-0.03, 0.1, 1.1
+      };
+
+   float3matrix rotator (fcoordinates);
+
+   float3vector mag_sensor_tilted = rotator * mag_sensor;
+
+   body_induction = compass_calibrator_3D.calibrate( mag_sensor_tilted, attitude);
+#else
+   body_induction = compass_calibration.calibrate(mag_sensor);
+#endif
+
+  float3vector mag_delta = body_induction - expected_body_induction; // for the training of the compensator
+
+#if USE_SOFT_IRON_COMPENSATION
+  if( automatic_magnetic_calibration == AUTO_SOFT_IRON_COMPENSATE)
+    body_induction = body_induction - soft_iron_compensator.compensate( expected_body_induction, attitude);
+#endif
+
+  body_induction_error = body_induction - expected_body_induction;
 
-  body_induction_error = body_induction - body2nav.reverse_map( expected_nav_induction);
   float3vector nav_acceleration = body2nav * acc;
 
   float heading_gnss_work = GNSS_heading	// correct for antenna alignment
-      + antenna_DOWN_correction  * SIN (euler.r)
-      - antenna_RIGHT_correction * COS (euler.r);
+      + antenna_DOWN_correction  * SIN (euler.roll)
+      - antenna_RIGHT_correction * COS (euler.roll);
 
-  heading_gnss_work = heading_gnss_work - euler.y; // = heading difference D-GNSS - AHRS
+  heading_gnss_work = heading_gnss_work - euler.yaw; // = heading difference D-GNSS - AHRS
 
   if (heading_gnss_work > M_PI_F) // map into { -PI PI}
     heading_gnss_work -= 2.0f * M_PI_F;
@@ -252,6 +278,9 @@ AHRS_type::update_diff_GNSS (const float3vector &gyro,
 	   + nav_acceleration[NORTH] * GNSS_acceleration[EAST]
 	   - nav_acceleration[EAST]  * GNSS_acceleration[NORTH];
 
+  circle_state_t old_circle_state = circling_state;
+  update_circling_state ();
+
   if( circling_state == CIRCLING) // heading correction using acceleration cross product GNSS * INS
     {
 
@@ -275,12 +304,18 @@ AHRS_type::update_diff_GNSS (const float3vector &gyro,
       gyro_integrator += gyro_correction; // update integrator
 
   gyro_correction = gyro_correction + gyro_integrator * I_GAIN;
+  gyro_correction_power = SQR( gyro_correction[0]) + SQR( gyro_correction[1]) +SQR( gyro_correction[2]);
+
   update_attitude (acc, gyro + gyro_correction, body_induction);
 
   // only here we get fresh magnetic entropy
   // and: wait for low control loop error
-  if ( (circling_state == CIRCLING) && ( nav_correction.abs() < NAV_CORRECTION_LIMIT))
-	feed_magnetic_induction_observer (mag_sensor);
+  if ( circling_state == CIRCLING)
+#if TEST_CALIBRATION_AND_COMPENSATION
+	feed_magnetic_induction_observer (mag_sensor_tilted, mag_delta);
+#else
+	feed_magnetic_induction_observer (mag_sensor, mag_delta);
+#endif
 
   // when circling is finished eventually update the magnetic calibration
   if (automatic_magnetic_calibration && (old_circle_state == CIRCLING) && (circling_state == TRANSITION))
@@ -295,12 +330,36 @@ AHRS_type::update_compass (const float3vector &gyro, const float3vector &acc,
 			   const float3vector &mag_sensor,
 			   const float3vector &GNSS_acceleration)
 {
-  if( compass_calibration.isCalibrationDone()) // use calibration if available
+  expected_body_induction = body2nav.reverse_map( expected_nav_induction);
+
+#if TEST_CALIBRATION_AND_COMPENSATION
+  float fcoordinates[] =
+      {
+	  	-0.05, 0.95, 0.03,
+		1, 0.1, -0.1,
+		-0.03, 0.1, 1.1
+      };
+
+   float3matrix rotator (fcoordinates);
+
+   float3vector mag_sensor_tilted = rotator * mag_sensor;
+
+   body_induction = compass_calibrator_3D.calibrate( mag_sensor_tilted, attitude);
+#else
       body_induction = compass_calibration.calibrate(mag_sensor);
-  else
-    body_induction = mag_sensor;
+#if USE_SOFT_IRON_COMPENSATION
+      if( automatic_magnetic_calibration == AUTO_SOFT_IRON_COMPENSATE)
+	body_induction = body_induction - soft_iron_compensator.compensate( expected_body_induction, attitude);
+#endif
+
+#endif
+
+   float3vector mag_delta = body_induction - expected_body_induction; // for the training of the compensator
 
-  body_induction_error = body_induction - body2nav.reverse_map( expected_nav_induction);
+// if( (automatic_magnetic_calibration == AUTO_3D)
+//     body_induction = body_induction - soft_iron_compensator.calibrate( expected_body_induction, attitude);
+
+  body_induction_error = body_induction - expected_body_induction;
 
   float3vector nav_acceleration = body2nav * acc;
   float3vector nav_induction = body2nav * body_induction;
@@ -313,9 +372,6 @@ AHRS_type::update_compass (const float3vector &gyro, const float3vector &acc,
   // calculate heading error depending on the present circling state
   // on state changes handle MAG auto calibration
 
-  circle_state_t old_circle_state = circling_state;
-  update_circling_state ();
-
   float mag_correction =
       + nav_induction[NORTH] * expected_nav_induction[EAST]
       - nav_induction[EAST]  * expected_nav_induction[NORTH];
@@ -324,6 +380,9 @@ AHRS_type::update_compass (const float3vector &gyro, const float3vector &acc,
 	   + nav_acceleration[NORTH] * GNSS_acceleration[EAST]
 	   - nav_acceleration[EAST]  * GNSS_acceleration[NORTH];
 
+  circle_state_t old_circle_state = circling_state;
+  update_circling_state ();
+
   switch (circling_state)
     {
     case STRAIGHT_FLIGHT:
@@ -352,23 +411,25 @@ AHRS_type::update_compass (const float3vector &gyro, const float3vector &acc,
     }
 
   gyro_correction = gyro_correction + gyro_integrator * I_GAIN;
+  gyro_correction_power = SQR( gyro_correction[0]) + SQR( gyro_correction[1]) +SQR( gyro_correction[2]);
 
   // feed quaternion update with corrected sensor readings
   update_attitude (acc, gyro + gyro_correction, body_induction);
 
-  body_induction_error = mag_sensor - body2nav.reverse_map( expected_nav_induction);
-
   // only here we get fresh magnetic entropy
   // and: wait for low control loop error
-  if ( (circling_state == CIRCLING) && ( nav_correction.abs() < NAV_CORRECTION_LIMIT))
-	feed_magnetic_induction_observer (mag_sensor);
+//  if ( (circling_state == CIRCLING) && ( nav_correction.abs() < NAV_CORRECTION_LIMIT))
+#if TEST_CALIBRATION_AND_COMPENSATION
+	feed_magnetic_induction_observer (mag_sensor_tilted, mag_delta);
+#else
+	feed_magnetic_induction_observer (mag_sensor, mag_delta);
+#endif
 
-  // when circling is finished eventually update the magnetic calibration
-  if (automatic_magnetic_calibration && (old_circle_state == CIRCLING) && (circling_state == TRANSITION))
-	  handle_magnetic_calibration('m');
+ // when circling is finished eventually update the magnetic calibration
+ if (automatic_magnetic_calibration && (old_circle_state == CIRCLING) && (circling_state == TRANSITION))
+  handle_magnetic_calibration('m');
 }
 
-
 /**
  * @brief  update attitude from IMU data NOT using magnetometer of D-GNSS
  */
@@ -376,11 +437,15 @@ void AHRS_type::update_ACC_only (const float3vector &gyro, const float3vector &a
 			   const float3vector &mag_sensor,
 			   const float3vector &GNSS_acceleration)
 {
-  float3vector mag;
-  if (compass_calibration.isCalibrationDone ()) // use calibration if available
-    mag = compass_calibration.calibrate (mag_sensor);
-  else
-    mag = mag_sensor;
+  expected_body_induction = body2nav.reverse_map( expected_nav_induction);
+  body_induction = compass_calibration.calibrate(mag_sensor);
+
+#if USE_SOFT_IRON_COMPENSATION
+  if( automatic_magnetic_calibration == AUTO_SOFT_IRON_COMPENSATE)
+	body_induction = body_induction - soft_iron_compensator.compensate( expected_body_induction, attitude);
+#endif
+
+  body_induction_error = body_induction - expected_body_induction;
 
   float3vector nav_acceleration = body2nav * acc;
 
@@ -407,7 +472,7 @@ void AHRS_type::update_ACC_only (const float3vector &gyro, const float3vector &a
   gyro_correction = gyro_correction + gyro_integrator * I_GAIN; // use integrator
 
   // feed quaternion update with corrected sensor readings
-  update_attitude(acc, gyro + gyro_correction, mag);
+  update_attitude(acc, gyro + gyro_correction, body_induction);
 }
 
 void AHRS_type::write_calibration_into_EEPROM( void)
@@ -427,40 +492,14 @@ void AHRS_type::handle_magnetic_calibration ( char type)
 {
   bool calibration_changed = compass_calibration.set_calibration_if_changed ( mag_calibration_data_collector_right_turn, mag_calibration_data_collector_left_turn, MAG_SCALE);
 
-  float induction_error = 0.0f;
-
   float3vector new_induction_estimate;
 
-#if USE_EARTH_INDUCTION_DATA_COLLECTOR
-
-  if (earth_induction_data_collector.data_valid ())
-	{
-	  new_induction_estimate = earth_induction_data_collector.get_estimated_induction();
-	  induction_error = SQRT(earth_induction_data_collector.get_variance ());
-
-	  if ( automatic_earth_field_parameters && ( induction_error < INDUCTION_STD_DEVIATION_LIMIT))
-	    {
-	      expected_nav_induction = new_induction_estimate;
-	      expected_nav_induction.normalize();
-	      update_magnetic_loop_gain(); // adapt to magnetic inclination
-
-	      calibration_changed = true;
-	    }
-	  earth_induction_data_collector.reset ();
-	}
-#endif
-
   if( calibration_changed)
     {
       magnetic_induction_report_t magnetic_induction_report;
       for( unsigned i=0; i<3; ++i)
 	magnetic_induction_report.calibration[i] = (compass_calibration.get_calibration())[i];
 
-#if USE_EARTH_INDUCTION_DATA_COLLECTOR
-      magnetic_induction_report.nav_induction=new_induction_estimate;
-      magnetic_induction_report.nav_induction_std_deviation = induction_error;
-#endif
-
       report_magnetic_calibration_has_changed( &magnetic_induction_report, type);
       magnetic_calibration_updated = true;
     }
diff --git a/NAV_Algorithms/AHRS.h b/NAV_Algorithms/AHRS.h
index 4ab7988..1797130 100644
--- a/NAV_Algorithms/AHRS.h
+++ b/NAV_Algorithms/AHRS.h
@@ -31,8 +31,8 @@
 #include "float3matrix.h"
 #include "integrator.h"
 #include "compass_calibration.h"
+#include "soft_iron_compensator.h"
 #include "HP_LP_fusion.h"
-#include "induction_observer.h"
 #include "pt2.h"
 
 extern float3vector nav_induction;
@@ -45,7 +45,7 @@ typedef enum  { STRAIGHT_FLIGHT, TRANSITION, CIRCLING} circle_state_t;
 
 typedef integrator<float, float3vector> vector3integrator;
 
-//! Attitude and heading reference system class
+//! Attitude and heading reference system
 class AHRS_type
 {
 public:
@@ -91,18 +91,6 @@ class AHRS_type
 	{
 		return induction_nav_frame;
 	}
-	inline float get_lin_e0(void) const
-	{
-		return attitude.lin_e0();
-	}
-	inline float get_lin_e1(void) const
-	{
-		return attitude.lin_e1();
-	}
-	inline float get_e2(void) const
-	{
-		return attitude.get_e2();
-	}
 	inline float get_north(void) const
 	{
 		return attitude.get_north();
@@ -198,6 +186,11 @@ class AHRS_type
     return body_induction;
   }
 
+  float getGyro_correction_Power () const
+  {
+    return gyro_correction_power;
+  }
+  enum magnetic_calibration_type { NONE, AUTO_1D, AUTO_SOFT_IRON_COMPENSATE};
 private:
   void handle_magnetic_calibration( char type);
 
@@ -210,7 +203,7 @@ class AHRS_type
       magnetic_control_gain = M_H_GAIN / expected_horizontal_induction;
   }
 
-  void feed_magnetic_induction_observer(const float3vector &mag_sensor);
+  void feed_magnetic_induction_observer( const float3vector &mag_sensor, const float3vector &mag_delta);
   circle_state_t update_circling_state( void);
 
   void update_diff_GNSS( const float3vector &gyro, const float3vector &acc, const float3vector &mag,
@@ -230,6 +223,7 @@ class AHRS_type
   float3vector acceleration_nav_frame;
   float3vector induction_nav_frame; 	//!< observed NAV induction
   float3vector expected_nav_induction;	//!< expected NAV induction
+  float3vector expected_body_induction;	//!< expected body frame induction
   float3matrix body2nav;
   eulerangle<ftype> euler;
   pt2<float,float> slip_angle_averager;
@@ -239,20 +233,17 @@ class AHRS_type
   linear_least_square_fit<int64_t, float> mag_calibration_data_collector_right_turn[3];
   linear_least_square_fit<int64_t, float> mag_calibration_data_collector_left_turn[3];
   compass_calibration_t <int64_t, float> compass_calibration;
-#if USE_EARTH_INDUCTION_DATA_COLLECTOR
-  induction_observer_t <int64_t> earth_induction_data_collector;
-#endif
   float antenna_DOWN_correction;  //!< slave antenna lower / DGNSS base length
   float antenna_RIGHT_correction; //!< slave antenna more right / DGNSS base length
   float heading_difference_AHRS_DGNSS;
   float cross_acc_correction;
   float magnetic_disturbance; //!< abs( observed_induction - expected_induction)
   float magnetic_control_gain; //!< declination-dependent magnetic control loop gain
-  bool automatic_magnetic_calibration;
-  bool automatic_earth_field_parameters; // todo unused, remove me some day
+  magnetic_calibration_type automatic_magnetic_calibration;
   bool magnetic_calibration_updated;
   float3vector body_induction;
   float3vector body_induction_error;
+  float gyro_correction_power;
 };
 
 #endif /* AHRS_H_ */
diff --git a/NAV_Algorithms/GNSS.h b/NAV_Algorithms/GNSS.h
index fdff995..07bf511 100644
--- a/NAV_Algorithms/GNSS.h
+++ b/NAV_Algorithms/GNSS.h
@@ -99,7 +99,7 @@ typedef enum { GNSS_HAVE_FIX, GNSS_NO_FIX, GNSS_ERROR} GNSS_Result;
 #define SAT_FIX 	1 // bits within sat_fix
 #define SAT_HEADING 	2
 
-//! this structure contains all important data from the GNSS
+//! Contains all important data from the GNSS
 typedef struct
 {
   float3vector position;  	//!< NED / meters
@@ -131,7 +131,7 @@ typedef struct
   uint16_t dummy;
 } coordinates_t;
 
-//! this class is organizing the data transfer from uBlox-GNSS to Larus
+//! Organizing the data transfer from a uBlox-GNSS receiver
 class GNSS_type
 {
 public:
diff --git a/NAV_Algorithms/NAV_tuning_parameters.h b/NAV_Algorithms/NAV_tuning_parameters.h
index 576dc91..c711aa2 100644
--- a/NAV_Algorithms/NAV_tuning_parameters.h
+++ b/NAV_Algorithms/NAV_tuning_parameters.h
@@ -25,11 +25,9 @@
 #ifndef NAV_ALGORITHMS_NAV_TUNING_PARAMETERS_H_
 #define NAV_ALGORITHMS_NAV_TUNING_PARAMETERS_H_
 
-#define USE_EARTH_INDUCTION_DATA_COLLECTOR 0
-
 #define MINIMUM_MAG_CALIBRATION_SAMPLES 6000
-#define MAG_OFFSET_CHANGE_LIMIT 0.01f
-#define MAG_SCALE_CHANGE_LIMIT 0.01f
+#define MAG_OFFSET_CHANGE_LIMIT 0.02f
+#define MAG_SCALE_CHANGE_LIMIT 0.02f
 
 #define CIRCLE_LIMIT (10 * 100) //!< 10 * 1/100 s delay into / out of circling state
 
@@ -51,18 +49,20 @@
 #define I_GAIN 0.00006f 		//!< Attitude controller: integral gain
 #define H_GAIN 38.0f			//!< Attitude controller: horizontal gain
 #define M_H_GAIN 6.0f			//!< Attitude controller: horizontal gain magnetic
-#define CROSS_GAIN 0.05f		//!< Attitude controller: cross-product gain
+#define CROSS_GAIN 0.05			//!< Attitude controller: cross-product gain
 #define INDUCTION_ERROR	0.015		//!< Maximum std deviation to update earth induction parameters
 #define NAV_CORRECTION_LIMIT 5.0f	//!< limit for "low AHRS correcting variable"
 #define HIGH_TURN_RATE 8.0*M_PI/180.0f	//!< turn rate high limit
 #define LOW_TURN_RATE  1.0*M_PI/180.0f	//!< turn rate low limit
 #define SPEED_COMPENSATION_FUSIONER_FEEDBACK 0.998f // empirically tuned alpha
+#define USE_OLD_FASHIONED_PRESSURE_VARIO 0 // for vario comparison tests (offline)
 
-#define USE_ACCELERATION_CROSS_GAIN_ALONE_WHEN_CIRCLING 1 	//!< if 1: do not use induction to control attitude while circling
+#define USE_ACCELERATION_CROSS_GAIN_ALONE_WHEN_CIRCLING 0 	//!< if 1: do not use induction to control attitude while circling
 #define DISABLE_CIRCLING_STATE		0	//!< for tests only: never use circling AHRS algorithm
 #define INDUCTION_STD_DEVIATION_LIMIT	0.03 	//!< results outperforming this number will be used further on
 
-#define AIRBORNE_TRIGGER_SPEED		0.5f //!< speed-compensator vario value m/s
+#define AIRBORNE_TRIGGER_SPEED_COMP	0.5f //!< speed-compensator vario value m/s
+#define AIRBORNE_TRIGGER_SPEED		15.0f //!< ground speed / m/s
 
 #define MAG_SCALE			10000.0f //!< scale factor for high-precision integer statistics
 #define FAST_SAMPLING_REQUENCY 		100.0f
@@ -71,5 +71,6 @@
 #define SLOW_SAMPLING_TIME 		0.1f
 
 #define GRAVITY				9.81f
+#define VECTOR_AVERAGE_COUNT		100 //!< average count for sensor orientation setup
 
 #endif /* NAV_ALGORITHMS_NAV_TUNING_PARAMETERS_H_ */
diff --git a/NAV_Algorithms/air_density_observer.cpp b/NAV_Algorithms/air_density_observer.cpp
index 7e4c1c5..45d3d21 100644
--- a/NAV_Algorithms/air_density_observer.cpp
+++ b/NAV_Algorithms/air_density_observer.cpp
@@ -25,60 +25,69 @@
 #include "embedded_math.h"
 #include <air_density_observer.h>
 
-air_data_result air_density_observer::feed_metering( float pressure, float MSL_altitude)
+air_data_result air_density_observer_t::feed_metering( float pressure, float GNSS_altitude)
 {
   air_data_result air_data;
 
-#if DENSITY_MEASURMENT_COLLECTS_INTEGER
-  density_QFF_calculator.add_value( MSL_altitude * 100.0f, pressure);
-#else
-  // use values normalized around 1.0
-  density_QFF_calculator.add_value( MSL_altitude * 1e-3, pressure * 1e-5);
-#endif
+  pressure_decimation_filter.respond( pressure);
+  altitude_decimation_filter.respond( GNSS_altitude);
+  --decimation_counter;
+  if( decimation_counter > 0)
+    return air_data;
+  decimation_counter = AIR_DENSITY_DECIMATION;
+
+  density_QFF_calculator.add_value( GNSS_altitude * 100.0f, pressure);
 
-  if( MSL_altitude > max_altitude)
-    max_altitude = MSL_altitude;
+  // update elevation range
+  if( GNSS_altitude > max_altitude)
+    max_altitude = GNSS_altitude;
 
-  if( MSL_altitude < min_altitude)
-    min_altitude = MSL_altitude;
+  if( GNSS_altitude < min_altitude)
+    min_altitude = GNSS_altitude;
 
-  if( false == altitude_trigger.process(MSL_altitude))
+  // if range too low: continue
+  if( (max_altitude - min_altitude) < MINIMUM_ALTITUDE_RANGE)
     return air_data;
 
-  if( ((max_altitude - min_altitude) < MINIMUM_ALTITUDE_RANGE) // ... forget this measurement
-    || (density_QFF_calculator.get_count() < 3000))
-    {
-      max_altitude = min_altitude = MSL_altitude;
-      density_QFF_calculator.reset();
-      return air_data;
-    }
+  // elevation range triggering
+  if( (max_altitude - min_altitude < MAXIMUM_ALTITUDE_RANGE) &&
+      false == altitude_trigger.process(GNSS_altitude))
+    return air_data;
+
+  // if data points too rare: continue
+  if (density_QFF_calculator.get_count() < 100)
+    return air_data;
 
+  // process last acquisition phase data
   linear_least_square_result<evaluation_type> result;
   density_QFF_calculator.evaluate(result);
 
-//  Due to numeric effects, when using float the
-//  variance has been observed to be negative in some cases !
-//  Therefore using float this test can not be used.
- assert( result.variance_slope > 0);
- assert( result.variance_offset > 0);
+//  Due to numeric effects, the variance has been observed
+//  to be negative in some cases.
+//  If this is the case: Throw away this result.
+  if( ( result.variance_slope < 0) || ( result.variance_offset < 0))
+      {
+      density_QFF_calculator.reset();
+      air_data.valid=false;
+      return air_data;
+      }
 
- if( result.variance_slope < MAX_ALLOWED_VARIANCE)
+ if( (result.variance_slope < MAX_ALLOWED_SLOPE_VARIANCE) &&
+     (result.variance_offset < MAX_ALLOWED_OFFSET_VARIANCE) )
     {
       air_data.QFF = (float)(result.y_offset);
-      float density = 100.0f * (float)(result.slope) * -0.10194f; // div by -9.81f;
-
-#if DENSITY_MEASURMENT_COLLECTS_INTEGER
-      float pressure = density_QFF_calculator.get_mean_y();
-#else
-      float pressure = 1e5 * density_QFF_calculator.get_mean_y();
-#endif
+      float density = 100.0f * (float)(result.slope) * -0.101936f; // div by -9.81f;
 
-      float std_density = 1.0496346613e-5f * pressure + 0.1671546011f;
+      float reference_altitude = density_QFF_calculator.get_mean_x() * 0.01f;
+      float std_density =
+	  reference_altitude * reference_altitude *   0.000000003547494f
+	  -0.000115412739613f * reference_altitude +1.224096628212817f;
       air_data.density_correction = density / std_density;
+      air_data.density_variance = result.variance_slope;
       air_data.valid = true;
     }
 
-  max_altitude = min_altitude = MSL_altitude;
+  max_altitude = min_altitude = GNSS_altitude;
   density_QFF_calculator.reset();
 
   return air_data;
diff --git a/NAV_Algorithms/air_density_observer.h b/NAV_Algorithms/air_density_observer.h
index f5cbc0e..5f126f5 100644
--- a/NAV_Algorithms/air_density_observer.h
+++ b/NAV_Algorithms/air_density_observer.h
@@ -25,39 +25,49 @@
 #ifndef AIR_DENSITY_OBSERVER_H_
 #define AIR_DENSITY_OBSERVER_H_
 
+#include "pt2.h"
 #include "Linear_Least_Square_Fit.h"
 #include "trigger.h"
 
-#define DENSITY_MEASURMENT_COLLECTS_INTEGER 1
 typedef double evaluation_type;
 typedef uint64_t measurement_type;
 
-#define MAX_ALLOWED_VARIANCE	1e-9
-#define MINIMUM_ALTITUDE_RANGE	300.0f
-#define ALTITUDE_TRIGGER_HYSTERESIS 50.0f
+#define ALTITUDE_TRIGGER_HYSTERESIS	50.0f
+#define MAX_ALLOWED_SLOPE_VARIANCE	3e-9
+#define MAX_ALLOWED_OFFSET_VARIANCE	200
+#define MINIMUM_ALTITUDE_RANGE		300.0f
+#define MAXIMUM_ALTITUDE_RANGE		800.0f
+#define USE_AIR_DENSITY_LETHARGY	1
+#define AIR_DENSITY_LETHARGY 		0.7
+#define AIR_DENSITY_DECIMATION		20
 
-//! this class maintains offset and slope of the air density measurement
+//! Maintains offset and slope of the air density measurement
 class air_data_result
 {
 public:
   air_data_result( void)
     : density_correction(1.0f),
+      density_variance(1.0f),
       QFF(101325.0f),
       valid( false)
   {}
   float density_correction;
+  float density_variance;
   float QFF;
   bool valid;
 };
 
-//! this class measures air density and QFF
-class air_density_observer
+//! Measures air density and reference pressure
+class air_density_observer_t
 {
 public:
-  air_density_observer (void)
+  air_density_observer_t (void)
   : min_altitude(10000.0f),
     max_altitude(0.0f),
-    altitude_trigger( ALTITUDE_TRIGGER_HYSTERESIS)
+    altitude_trigger( ALTITUDE_TRIGGER_HYSTERESIS),
+    decimation_counter( 20),
+    altitude_decimation_filter( 1.0f / AIR_DENSITY_DECIMATION * 0.25f),
+    pressure_decimation_filter( 1.0f / AIR_DENSITY_DECIMATION * 0.25f)
   {
   }
   air_data_result feed_metering( float pressure, float MSL_altitude);
@@ -75,6 +85,9 @@ class air_density_observer
     float min_altitude;
     float max_altitude;
     trigger altitude_trigger;
+    unsigned decimation_counter;
+    pt2 <float, float> altitude_decimation_filter;
+    pt2 <float, float> pressure_decimation_filter;
 };
 
 #endif /* AIR_DENSITY_OBSERVER_H_ */
diff --git a/NAV_Algorithms/airborne_detector.h b/NAV_Algorithms/airborne_detector.h
index d3fb97e..2ccd63a 100644
--- a/NAV_Algorithms/airborne_detector.h
+++ b/NAV_Algorithms/airborne_detector.h
@@ -26,6 +26,7 @@
 #ifndef NAV_ALGORITHMS_AIRBORNE_DETECTOR_H_
 #define NAV_ALGORITHMS_AIRBORNE_DETECTOR_H_
 
+//! observation of the aircraft state (ground / flying)
 class airborne_detector_t
 {
 public:
@@ -37,12 +38,14 @@ class airborne_detector_t
   {
     if( yes)
       {
-	if( airborne_counter < LEVEL)
+	if( airborne_counter == 0)
+	  airborne_counter = 100; // create hysteresis
+	else if( airborne_counter < LEVEL)
 	  ++ airborne_counter;
       }
     else
       {
-      if( airborne_counter > 0)
+	      if( airborne_counter > 0)
 	{
 	  --airborne_counter;
 	  if( airborne_counter == 0)
@@ -50,7 +53,13 @@ class airborne_detector_t
 	}
       }
   }
-    bool detect_just_landed( void)
+
+  bool is_airborne( void)
+    {
+      return airborne_counter > 0;
+    }
+
+  bool detect_just_landed( void)
     {
       if( just_landed)
 	{
diff --git a/NAV_Algorithms/atmosphere.h b/NAV_Algorithms/atmosphere.h
index 851ab12..cc72cb2 100644
--- a/NAV_Algorithms/atmosphere.h
+++ b/NAV_Algorithms/atmosphere.h
@@ -28,6 +28,7 @@
 
 #include "embedded_math.h"
 #include <air_density_observer.h>
+#include "NAV_tuning_parameters.h"
 #include <pt2.h>
 
 #define RECIP_STD_DENSITY_TIMES_2 1.632f
@@ -41,7 +42,7 @@
 /*! The offest for the conversion from degree celsius to kelvin */
 #define CELSIUS_TO_KELVIN_OFFSET 273.15f
 
-//! this class maintains instant atmosphere data like pressure, density etc
+//! Maintenance of atmosphere data like pressure, density etc.
 class atmosphere_t
 {
 public:
@@ -52,18 +53,27 @@ class atmosphere_t
     temperature(20.0f),
     humidity( 0.0f),
     density_correction(1.0f),
-    density_correction_averager(0.001f),
-    QFF(101325)
+    extrapolated_sea_level_pressure(101325),
+    GNSS_altitude_based_density_available(false),
+    GNSS_altitude_based_density(1.2255f),
+    old_density_correction(1.0f),
+    weight_sum(0.0f),
+    density_factor_weighed_sum(0.0f)
   {
-    density_correction_averager.settle(1.0f);
   }
-  void update_density_correction( void)
+  void update_density( float GNSS_altitude, bool valid)
   {
-    density_correction_averager.respond(density_correction);
+    if( valid)
+      {
+	GNSS_altitude_based_density = get_std_density( GNSS_altitude) * density_correction;
+	GNSS_altitude_based_density_available = true;
+      }
+    else
+      GNSS_altitude_based_density_available = false;
   }
   void initialize( float altitude)
   {
-    density_QFF_calculator.initialize(altitude);
+    air_density_observer.initialize(altitude);
   }
   void set_pressure( float p_abs)
   {
@@ -73,11 +83,25 @@ class atmosphere_t
   {
     return pressure;
   }
+  float get_std_density( float GNSS_altitude)
+  {
+    float std_density =
+	0.000000003547494f * GNSS_altitude * GNSS_altitude
+	-0.000115412739613f * GNSS_altitude + 1.224096628212817f;
+    return std_density;
+  }
+  float get_pressure_density( float static_pressure)
+  {
+    return 1.0496346613e-5f * static_pressure + 0.1671546011f;
+  }
   float get_density( void) const
   {
-    return  (1.0496346613e-5f * pressure + 0.1671546011f) * density_correction_averager.get_output();
+    if( GNSS_altitude_based_density_available)
+      return GNSS_altitude_based_density;
+    else
+      return  (1.0496346613e-5f * pressure + 0.1671546011f) * density_correction;
   }
-  float get_negative_altitude( void) const
+  float get_negative_pressure_altitude( void) const
   {
     float tmp = 8.104381531e-4f * pressure;
     return - tmp * tmp  + 0.20867299170f * pressure - 14421.43945f;
@@ -101,18 +125,28 @@ class atmosphere_t
     have_ambient_air_data = false;
   }
 
-  float get_QFF () const
+  float get_extrapolated_sea_level_pressure () const
   {
-    return QFF;
+    return extrapolated_sea_level_pressure;
   }
 
-  void feed_QFF_density_metering( float pressure, float MSL_altitude)
+  void air_density_metering( float pressure, float MSL_altitude)
     {
-    air_data_result result = density_QFF_calculator.feed_metering( pressure, MSL_altitude);
+    air_data_result result = air_density_observer.feed_metering( pressure, MSL_altitude);
       if( result.valid)
 	{
-	  QFF = result.QFF;
+#if USE_AIR_DENSITY_LETHARGY
+	  bool first_measurement = (weight_sum == 0.0f);
+
+	  weight_sum =  weight_sum * AIR_DENSITY_LETHARGY + (AIR_DENSITY_LETHARGY -1) / result.density_variance;
+	  density_factor_weighed_sum =  density_factor_weighed_sum * AIR_DENSITY_LETHARGY + (AIR_DENSITY_LETHARGY -1) * result.density_correction / result.density_variance;
+
+	  // postpone update unless we have two measurements
+	  if( ! first_measurement)
+	    density_correction = density_factor_weighed_sum / weight_sum;
+#else
 	  density_correction = result.density_correction;
+#endif
 	}
     }
 
@@ -127,9 +161,13 @@ class atmosphere_t
   float temperature;
   float humidity;
   float density_correction;
-  pt2<float,float> density_correction_averager;
-  float QFF;
-  air_density_observer density_QFF_calculator;
+  float extrapolated_sea_level_pressure;
+  air_density_observer_t air_density_observer;
+  bool GNSS_altitude_based_density_available;
+  float GNSS_altitude_based_density;
+  float old_density_correction;
+  float weight_sum;
+  float density_factor_weighed_sum;
 };
 
 #endif /* APPLICATION_ATMOSPHERE_H_ */
diff --git a/NAV_Algorithms/compass_calibration.h b/NAV_Algorithms/compass_calibration.h
index 8e1b3cf..919b8b9 100644
--- a/NAV_Algorithms/compass_calibration.h
+++ b/NAV_Algorithms/compass_calibration.h
@@ -68,7 +68,7 @@ class single_axis_calibration_t
   float variance; 	//!< measure of precision: sensor calibration parameter variance
 };
 
-//! this class maintains 3d magnetic calibration data
+//! Maintains 3 axes magnetic calibration data
 template <class sample_type, class evaluation_type> class compass_calibration_t
 {
 public:
@@ -91,7 +91,6 @@ template <class sample_type, class evaluation_type> class compass_calibration_t
 
   bool set_default (void)
   {
-    float variance;
     calibration_done = true;
     for( unsigned i=0; i<3; ++i)
       {
@@ -145,7 +144,7 @@ template <class sample_type, class evaluation_type> class compass_calibration_t
     return true;
   }
 
-  bool isCalibrationDone () const
+  bool available () const
   {
     return calibration_done;
   }
diff --git a/NAV_Algorithms/compass_calibrator_3D.cpp b/NAV_Algorithms/compass_calibrator_3D.cpp
new file mode 100644
index 0000000..3178d74
--- /dev/null
+++ b/NAV_Algorithms/compass_calibrator_3D.cpp
@@ -0,0 +1,204 @@
+/***********************************************************************//**
+ * @file		compass_calibrator_3D.cpp
+ * @brief		induction sensor calibration and magnetic disturbance compensation
+ * @author		Dr. Klaus Schaefer
+ * @copyright 		Copyright 20.8.2024 Dr. Klaus Schaefer. All rights reserved.
+ * @license 		This project is released under the GNU Public License GPL-3.0
+
+    <Larus Flight Sensor Firmware>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+ **************************************************************************/
+
+#define PRINT_PARAMETERS 1
+#if PRINT_PARAMETERS
+#include "stdio.h"
+#endif
+
+#include "embedded_math.h"
+#include <matrix_functions.h>
+#include "compass_calibrator_3D.h"
+
+ROM float RECIP_SECTOR_SIZE = compass_calibrator_3D_t::OBSERVATIONS / M_PI_F / TWO / TWO;
+
+bool compass_calibrator_3D_t::learn (const float3vector &observed_induction,const float3vector &expected_induction, const quaternion<float> &q, bool turning_right, float error_margin)
+{
+  float present_heading = q.get_heading();
+  if( present_heading <0.0f)
+    present_heading += M_PI_F * TWO;
+
+  unsigned sector_index = (turning_right ? OBSERVATIONS / TWO : 0) + (unsigned)(present_heading * RECIP_SECTOR_SIZE);
+
+  // if we have just left the last sector to be collected: report ready for computation
+  if( ( last_sector_collected != -1) && ( sector_index != last_sector_collected) && (++measurement_counter > 10000))
+    return true;
+
+  if( heading_sector_error[sector_index] > 1e19) // this sector has not been written before
+    ++populated_sectors;
+
+  if ( heading_sector_error[sector_index] < error_margin)
+    return false; // we have collected more precise data for this observation earlier
+
+  heading_sector_error[sector_index] = error_margin;
+
+  for( unsigned axis = 0; axis < AXES; ++axis)
+    {
+      target_vector[axis][sector_index] = expected_induction[axis];
+
+      observation_matrix[axis][sector_index][0] = 1.0f;
+      observation_matrix[axis][sector_index][1] = observed_induction[0];
+      observation_matrix[axis][sector_index][2] = observed_induction[1];
+      observation_matrix[axis][sector_index][3] = observed_induction[2];
+    }
+
+  if( (last_sector_collected == INVALID) && populated_sectors >= OBSERVATIONS)
+    // now we are collecting data within the last sector to be collected
+    last_sector_collected = sector_index;
+
+  return false;
+}
+
+bool compass_calibrator_3D_t::calculate( void)
+{
+//  if( buffer_used_for_calibration != INVALID)
+//    return false;
+
+  computation_float_type temporary_solution_matrix[PARAMETERS][PARAMETERS];
+  ARM_MATRIX_INSTANCE solution;
+  solution.numCols=PARAMETERS;
+  solution.numRows=PARAMETERS;
+  solution.pData = (computation_float_type *)temporary_solution_matrix;
+
+  ARM_MATRIX_INSTANCE  observations;
+  observations.numCols=PARAMETERS;
+  observations.numRows=OBSERVATIONS;
+
+  computation_float_type transposed_matrix[PARAMETERS][OBSERVATIONS];
+  ARM_MATRIX_INSTANCE observations_transposed;
+  observations_transposed.numCols=OBSERVATIONS;
+  observations_transposed.numRows=PARAMETERS;
+  observations_transposed.pData = (computation_float_type *)transposed_matrix;
+
+  computation_float_type matrix_to_be_inverted_data[PARAMETERS][PARAMETERS];
+  ARM_MATRIX_INSTANCE matrix_to_be_inverted;
+  matrix_to_be_inverted.numCols=PARAMETERS;
+  matrix_to_be_inverted.numRows=PARAMETERS;
+  matrix_to_be_inverted.pData = (computation_float_type *)matrix_to_be_inverted_data;
+
+  computation_float_type solution_mapping_data[PARAMETERS][OBSERVATIONS];
+  ARM_MATRIX_INSTANCE solution_mapping;
+  solution_mapping.numCols=OBSERVATIONS;
+  solution_mapping.numRows=PARAMETERS;
+  solution_mapping.pData = (computation_float_type *)solution_mapping_data;
+
+  int next_buffer;
+  if( buffer_used_for_calibration == 0)
+    next_buffer = 1;
+  else
+    next_buffer = 0;
+
+  for( unsigned axis = 0; axis < AXES; ++axis)
+    {
+      observations.pData = &(observation_matrix[axis][0][0]);
+
+      // calculation, once per axis (FRONT, RIGHT, DOWN):
+      // target vector:        T = ideal induction values for all observations
+      // single measurement:   < 1 induction q0q1 q0q2 q0q3 q1q1 q1q2 q1q3 q2q2 q2q3 q3q3 > (single row)
+      // measurement matrix:   M = single measurement * # OBSERVATIONS
+      // solution matrix:      S = inverse( M_transposed * M) * M_transposed
+      // axis parameter set:   P = S * T
+
+      arm_status result = ARM_MAT_TRANS( &observations, &observations_transposed);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+
+      result = ARM_MAT_MULT( &observations_transposed, &observations, &matrix_to_be_inverted);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+
+      result = ARM_MAT_INVERSE( &matrix_to_be_inverted, &solution);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+
+      result = ARM_MAT_MULT( &solution, &observations_transposed, &solution_mapping);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+
+      ARM_MATRIX_INSTANCE target_vector_inst;
+      target_vector_inst.numCols=1;
+      target_vector_inst.numRows=OBSERVATIONS;
+      target_vector_inst.pData=&(target_vector[axis][0]);
+
+      ARM_MATRIX_INSTANCE axis_parameter_set;
+      axis_parameter_set.numCols=1;
+      axis_parameter_set.numRows=PARAMETERS;
+      axis_parameter_set.pData=&(c[next_buffer][axis][0]);
+      result = ARM_MAT_MULT( &solution_mapping, &target_vector_inst, &axis_parameter_set);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+    }
+
+  buffer_used_for_calibration = next_buffer; // switch now in a thread-save manner
+  
+#if PRINT_PARAMETERS
+
+  for( unsigned k=0; k < AXES; ++k)
+    {
+      for( unsigned i=0; i<PARAMETERS; ++i)
+	printf("%e\t", (double)(c[next_buffer][k][i]));
+      printf("\n");
+    }
+  printf("\n");
+#endif
+  
+  start_learning(); // ... again
+
+  return true;
+}
+
+float3vector compass_calibrator_3D_t::calibrate( const float3vector &induction, const quaternion<float> &q)
+  {
+    if( buffer_used_for_calibration == INVALID) // we do not have a valid calibration
+      return induction;
+
+    unsigned b=buffer_used_for_calibration; // just to save expensive characters ...
+
+    float3vector retv;
+    for( int i = 0; i < AXES; ++i)
+      {
+	retv[i] =
+	    c[b][i][0] +
+	    c[b][i][1] * induction[0] +
+	    c[b][i][2] * induction[1] +
+	    c[b][i][3] * induction[2];
+      }
+
+    return retv;
+  }
diff --git a/NAV_Algorithms/compass_calibrator_3D.h b/NAV_Algorithms/compass_calibrator_3D.h
new file mode 100644
index 0000000..100cf9f
--- /dev/null
+++ b/NAV_Algorithms/compass_calibrator_3D.h
@@ -0,0 +1,116 @@
+/***********************************************************************//**
+ * @file		compass_calibrator_3D.h
+ * @brief		induction sensor calibration and magnetic disturbance compensation
+ * @author		Dr. Klaus Schaefer
+ * @copyright 		Copyright 20.8.2024 Dr. Klaus Schaefer. All rights reserved.
+ * @license 		This project is released under the GNU Public License GPL-3.0
+
+    <Larus Flight Sensor Firmware>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+ **************************************************************************/
+
+#ifndef NAV_ALGORITHMS_COMPASS_CALIBRATOR_3D_H_
+#define NAV_ALGORITHMS_COMPASS_CALIBRATOR_3D_H_
+
+#include "embedded_memory.h"
+#include "embedded_math.h"
+#include "float3vector.h"
+#include "quaternion.h"
+
+#include "matrix_functions.h"
+
+#if 0
+typedef double computation_float_type;
+#define ARM_MATRIX_INSTANCE arm_matrix_instance_f64
+#define ARM_MAT_TRANS arm_mat_trans_f64
+#define ARM_MAT_MULT arm_mat_mult_f64
+#define ARM_MAT_INVERSE arm_mat_inverse_f64
+#else
+typedef float computation_float_type;
+#define ARM_MATRIX_INSTANCE arm_matrix_instance_f32
+#define ARM_MAT_TRANS arm_mat_trans_f32
+#define ARM_MAT_MULT arm_mat_mult_f32
+#define ARM_MAT_INVERSE arm_mat_inverse_f32
+#endif
+
+//! 3 dimensional magnetic calibration and error compensation mechanism
+class compass_calibrator_3D_t
+{
+public:
+  enum { AXES=3, PARAMETERS=4, OBSERVATIONS=22, INVALID=-1};
+
+  compass_calibrator_3D_t( void)
+    : buffer_used_for_calibration(INVALID),
+      measurement_counter(0)
+{
+    start_learning();
+  }
+
+  void start_learning( void)
+  {
+    populated_sectors = 0;
+    last_sector_collected = INVALID;
+    measurement_counter=0;
+    for( unsigned i=0; i < OBSERVATIONS; ++i)
+      heading_sector_error[i]=1e20f;
+  }
+
+  bool learn (const float3vector &observed_induction,const float3vector &expected_induction, const quaternion<float> &q, bool turning_right, float error_margin);
+  float3vector calibrate( const float3vector &induction, const quaternion<float> &q);
+  bool calculate( void);
+  bool available( void) const
+  {
+    return buffer_used_for_calibration != INVALID;
+  }
+
+  const computation_float_type * get_current_parameters( void) const
+  {
+    if( buffer_used_for_calibration == INVALID)
+      return 0;
+
+    return &(c[buffer_used_for_calibration][0][0]);
+  }
+
+  void set_current_parameters( const float * source)
+  {
+    int next_buffer;
+    if( buffer_used_for_calibration != 0)
+      next_buffer = 0;
+    else
+      next_buffer = 1;
+
+    computation_float_type * destination = &(c[next_buffer][0][0]);
+    for( unsigned i=0; i < AXES * PARAMETERS; ++i)
+      *destination++ = *source++;
+
+    buffer_used_for_calibration = next_buffer;
+  }
+
+private:
+  int buffer_used_for_calibration;
+  unsigned populated_sectors;
+  unsigned last_sector_collected;
+  unsigned measurement_counter;
+  computation_float_type c[2][AXES][PARAMETERS]; // double buffering for multi-thrading support
+  computation_float_type target_vector[AXES][OBSERVATIONS];
+  computation_float_type observation_matrix[AXES][OBSERVATIONS][PARAMETERS];
+  computation_float_type heading_sector_error[OBSERVATIONS];
+};
+
+extern compass_calibrator_3D_t compass_calibrator_3D;
+void trigger_compass_calibrator_3D_calculation(void);
+
+#endif /* NAV_ALGORITHMS_COMPASS_CALIBRATOR_3D_H_ */
diff --git a/NAV_Algorithms/compass_ground_calibration.h b/NAV_Algorithms/compass_ground_calibration.h
index 69ab926..1d070c8 100644
--- a/NAV_Algorithms/compass_ground_calibration.h
+++ b/NAV_Algorithms/compass_ground_calibration.h
@@ -8,6 +8,7 @@
 
 #define CUTOFF_DIV_BY_SAMPLING_FREQ 0.01f
 
+//! helper class to support manual compass calibration
 class compass_ground_calibration_t
 {
 public:
diff --git a/NAV_Algorithms/data_structures.h b/NAV_Algorithms/data_structures.h
index fb8f0c7..b10aa43 100644
--- a/NAV_Algorithms/data_structures.h
+++ b/NAV_Algorithms/data_structures.h
@@ -31,8 +31,7 @@
 
 #pragma pack(push, 1)
 
-
-//! contains all calibrated data from the sensors
+//! contains all input data from the sensors
 typedef struct
 {
   float3vector acc;   //XSENSE MTi1 IMU
@@ -48,6 +47,15 @@ typedef struct
 #endif
 } measurement_data_t;
 
+//! this structure contains all the observations from all sensors and the GNSS-receiver
+typedef struct
+{
+  float3vector acc;
+  float3vector gyro;
+  float3vector mag;
+  float temperature;
+} extra_sensor_data_t;
+
 //! this structure contains all the observations from sensors and GNSS
 typedef struct
 {
@@ -57,6 +65,9 @@ typedef struct
   float dummy2;
 #endif
   coordinates_t c;
+#if WITH_EXTERNAL_IMU
+  extra_sensor_data_t extra;
+#endif
 #if RUN_MICROPHONE
   float sound_intensity;
 #endif
@@ -67,6 +78,9 @@ typedef struct
 {
   measurement_data_t m;
   coordinates_t c;
+#if WITH_EXTERNAL_IMU
+  extra_sensor_data_t extra;
+#endif
 #if RUN_MICROPHONE
   float sound_intensity;
 #endif
@@ -120,6 +134,7 @@ typedef struct
   float vario_wind_E;
   float3vector body_induction;
   float3vector body_induction_error;
+  float gyro_correction_power;
 #endif
 
 } output_data_t;
diff --git a/NAV_Algorithms/earth_induction_model.h b/NAV_Algorithms/earth_induction_model.h
index 6ec8b98..0f8a285 100644
--- a/NAV_Algorithms/earth_induction_model.h
+++ b/NAV_Algorithms/earth_induction_model.h
@@ -30,6 +30,7 @@
 
 enum { N_AREAS=8, N_COEFFICIENTS=10};
 
+//! struct containing magnetic induction data for a point
 typedef struct
 {
   float declination; //!< positive to the east
@@ -37,6 +38,7 @@ typedef struct
   bool valid;
 } induction_values;
 
+//! struct containing magnetic induction data for a regional region
 typedef struct
 {
   double longitude_limit_west;
@@ -47,6 +49,7 @@ typedef struct
   double coefficients_inclination[N_COEFFICIENTS];
 } induction_model_area_t;
 
+//! Providing worldwide magnetic induction data
 class earth_induction_model_t
 {
 private:
diff --git a/NAV_Algorithms/induction_observer.h b/NAV_Algorithms/induction_observer.h
deleted file mode 100644
index f1ac2c9..0000000
--- a/NAV_Algorithms/induction_observer.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * induction_observer.h
- *
- *  Created on: Feb 6, 2023
- *      Author: schaefer
- */
-
-#ifndef NAV_ALGORITHMS_INDUCTION_OBSERVER_H_
-#define NAV_ALGORITHMS_INDUCTION_OBSERVER_H_
-#include "mean_and_variance_finder.h"
-#include "float3vector.h"
-
-template <class sample_type> class induction_observer_t
-{
-public:
-  induction_observer_t( float _scale_factor)
-  : scale_factor( _scale_factor)
-  {}
-  void feed (float3vector induction, bool right_turn)
-  {
-    if (right_turn)
-	for( unsigned i=0; i<3; ++i)
-	  induction_observer_right[i].feed (induction[i] * scale_factor);
-    else
-	for( unsigned i=0; i<3; ++i)
-	  induction_observer_left[i].feed (induction[i] * scale_factor);
-  }
-  void
-  reset (void)
-  {
-    for( unsigned i=0; i<3; ++i)
-      {
-	induction_observer_right[i].reset ();
-	induction_observer_left[i].reset ();
-      }
-}
-  bool data_valid( void) const
-  {
-    return (
-	(induction_observer_right[0].get_samples() > MINIMUM_MAG_CALIBRATION_SAMPLES) &&
-	(induction_observer_left[0].get_samples() > MINIMUM_MAG_CALIBRATION_SAMPLES) );
-  }
-  float3vector get_estimated_induction( void) const
-  {
-    float3vector retv;
-    for( unsigned i=0; i<3; ++i)
-      retv[i] = (induction_observer_right[i].get_mean() + induction_observer_left[i].get_mean()) * 0.5f / scale_factor;
-    return retv;
-  }
-  float get_variance( void ) const
-  {
-    float sum = 0.0f;
-    for( unsigned i=0; i<3; ++i)
-      {
-	sum += induction_observer_right[i].get_variance();
-	sum += induction_observer_left[i].get_variance();
-      }
-    return sum * 0.1666666666f / scale_factor / scale_factor;
-  }
-private:
-  enum{ MINIMUM_SAMPLES = 10000};
-  float scale_factor;
-  mean_and_variance_finder_t <sample_type> induction_observer_right[3];
-  mean_and_variance_finder_t <sample_type> induction_observer_left[3];
-};
-
-#endif /* NAV_ALGORITHMS_INDUCTION_OBSERVER_H_ */
diff --git a/NAV_Algorithms/magnetic_induction_report.h b/NAV_Algorithms/magnetic_induction_report.h
index fda876d..12dce15 100644
--- a/NAV_Algorithms/magnetic_induction_report.h
+++ b/NAV_Algorithms/magnetic_induction_report.h
@@ -12,13 +12,11 @@
 #include "float3vector.h"
 #include "NAV_tuning_parameters.h"
 
+//! helper struct containing magnetic calibration data
 struct magnetic_induction_report_t
 {
   single_axis_calibration_t calibration[3];
-#if USE_EARTH_INDUCTION_DATA_COLLECTOR
-  float3vector nav_induction;
-  float nav_induction_std_deviation;
-#endif
+  bool valid;
 };
 
 void report_magnetic_calibration_has_changed( magnetic_induction_report_t *p_magnetic_induction_report, char type);
diff --git a/NAV_Algorithms/navigator.cpp b/NAV_Algorithms/navigator.cpp
index 5949775..d71d630 100644
--- a/NAV_Algorithms/navigator.cpp
+++ b/NAV_Algorithms/navigator.cpp
@@ -52,7 +52,7 @@ void navigator_t::update_at_100Hz (
       ahrs.get_nav_acceleration (),
       heading_vector,
       GNSS_negative_altitude,
-      atmosphere.get_negative_altitude(),
+      atmosphere.get_negative_pressure_altitude(),
       IAS,
       wind_observer.get_speed_compensator_wind(),
       (GNSS_fix_type != 0)
@@ -85,24 +85,39 @@ void navigator_t::update_GNSS_data( const coordinates_t &coordinates)
 bool navigator_t::update_at_10Hz ()
 {
   bool landing_detected=false;
-  atmosphere.feed_QFF_density_metering(
-	air_pressure_resampler_100Hz_10Hz.get_output(),
-	flight_observer.get_filtered_GNSS_altitude());
 
-  atmosphere.update_density_correction(); // here because of the 10 Hz call frequency
+  atmosphere.update_density( -GNSS_negative_altitude, GNSS_fix_type > 0);
 
   wind_observer.process_at_10_Hz( ahrs);
 
   vario_integrator.update (flight_observer.get_vario_GNSS(), // here because of the update rate 10Hz
-			   ahrs.get_euler ().y,
+			   ahrs.get_euler ().yaw,
 			   ahrs.get_circling_state ());
 
-  airborne_detector.report_to_be_airborne( abs( flight_observer.get_speed_compensation_GNSS()) > AIRBORNE_TRIGGER_SPEED);
+  unsigned airborne_criteria_fulfilled = 0;
+  if( abs( flight_observer.get_speed_compensation_GNSS()) > AIRBORNE_TRIGGER_SPEED_COMP)
+    ++ airborne_criteria_fulfilled;
+
+  if( GNSS_velocity.abs() > AIRBORNE_TRIGGER_SPEED)
+    ++ airborne_criteria_fulfilled;
+
+  if( IAS > AIRBORNE_TRIGGER_SPEED)
+    ++ airborne_criteria_fulfilled;
+
+  // if at least 2 of 3 criteria apply, we believe to be airborne
+  airborne_detector.report_to_be_airborne( airborne_criteria_fulfilled > 1);
   if( airborne_detector.detect_just_landed())
     {
       ahrs.write_calibration_into_EEPROM();
       landing_detected = true;
     }
+
+  if( airborne_detector.is_airborne())
+    atmosphere.air_density_metering(
+	air_pressure_resampler_100Hz_10Hz.get_output(),
+	flight_observer.get_filtered_GNSS_altitude());
+
+
   return landing_detected;
 }
 
@@ -133,14 +148,14 @@ void navigator_t::report_data( output_data_t &d)
     d.slip_angle		= ahrs.getSlipAngle();
     d.pitch_angle		= ahrs.getPitchAngle();
     d.G_load			= ahrs.get_G_load();
-    d.pressure_altitude		= - atmosphere.get_negative_altitude();
+    d.pressure_altitude		= - atmosphere.get_negative_pressure_altitude();
     d.magnetic_disturbance	= ahrs.getMagneticDisturbance();
     d.air_density		= atmosphere.get_density();
     d.nav_induction_gnss 	= ahrs.get_nav_induction();
 
 #if DEVELOPMENT_ADDITIONS
 
-    d.QFF			= atmosphere.get_QFF();
+    d.QFF			= atmosphere.get_extrapolated_sea_level_pressure();
     d.nav_correction		= ahrs.get_nav_correction();
     d.gyro_correction		= ahrs.get_gyro_correction();
     d.nav_acceleration_gnss 	= ahrs.get_nav_acceleration();
@@ -165,5 +180,7 @@ void navigator_t::report_data( output_data_t &d)
     d.vario_wind_E		= wind_observer.get_speed_compensator_wind()[EAST];
     d.body_induction		= ahrs.getBodyInduction();
     d.body_induction_error	= ahrs.getBodyInductionError();
+//    d.body_induction_error	= ahrs_magnetic.getBodyInductionError();
+    d.gyro_correction_power	= ahrs.getGyro_correction_Power();
 #endif
 }
diff --git a/NAV_Algorithms/navigator.h b/NAV_Algorithms/navigator.h
index 1fe285f..815993f 100644
--- a/NAV_Algorithms/navigator.h
+++ b/NAV_Algorithms/navigator.h
@@ -50,7 +50,7 @@ class navigator_t
 	 flight_observer(),
 	 wind_observer(),
 	 airborne_detector(),
-	 air_pressure_resampler_100Hz_10Hz(0.04f), // f / fcutoff = 80% * 0.5 * 0.1
+	 air_pressure_resampler_100Hz_10Hz(0.025f), // = 2.5 Hz @ 100Hz
 	 pitot_pressure(0.0f),
 	 TAS( 0.0f),
 	 IAS( 0.0f),
@@ -74,7 +74,6 @@ class navigator_t
     ahrs_magnetic.update_magnetic_induction_data( declination, inclination);
 #endif
   }
-
   void set_density_data( float temperature, float humidity)
   {
     if( ! isnan( temperature) && ! isnan( humidity) )
@@ -89,7 +88,7 @@ class navigator_t
 
   void feed_QFF_density_metering( float pressure, float MSL_altitude)
   {
-    atmosphere.feed_QFF_density_metering( pressure, MSL_altitude);
+    atmosphere.air_density_metering( pressure, MSL_altitude);
   }
 
   void disregard_density_data( void)
@@ -119,7 +118,7 @@ class navigator_t
 
   void reset_altitude( void)
   {
-    flight_observer.reset( atmosphere.get_negative_altitude(), GNSS_negative_altitude);
+    flight_observer.reset( atmosphere.get_negative_pressure_altitude(), GNSS_negative_altitude);
   }
   /**
    * @brief update pitot pressure
diff --git a/NAV_Algorithms/old_data_structures.h b/NAV_Algorithms/old_data_structures.h
deleted file mode 100644
index aed6212..0000000
--- a/NAV_Algorithms/old_data_structures.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * old_data_structures.h
- *
- *  Created on: Nov 5, 2022
- *      Author: schaefer
- */
-
-#ifndef OLD_DATA_STRUCTURES_H_
-#define OLD_DATA_STRUCTURES_H_
-
-#include "system_configuration.h"
-#include "quaternion.h"
-#include "GNSS.h"
-#include "data_structures.h"
-#include "cmath"
-
-#pragma pack(push, 1)
-
-typedef struct
-{
-  float3vector acc;
-  float3vector gyro;
-  float3vector mag;
-  float3vector lowcost_acc;
-  float3vector lowcost_gyro;
-  float3vector lowcost_mag;
-  float pitot_pressure;
-  float static_pressure;
-  float absolute_pressure;  		//this is the second ms5611 on the PCB.
-  float static_sensor_temperature;  	//log temperature to monitor temperature in enclosure
-  float absolute_sensor_temperature;
-  float supply_voltage;  		//Measuring the supply voltage. Might be related to sensor noise.
-#if WITH_DENSITY_DATA
-  float outside_air_temperature; //!< OAT from external sensor if installed
-  float outside_air_humidity; //!< 0.0 -> 1.0 NOT percent
-#endif
-} old_measurement_data_t;
-
-typedef struct
-{
-  float3vector position;  	//!< NED / meters
-  float3vector velocity;  	//!< NED / m/s
-  float3vector acceleration;  	//!< NED / m/s^2 (from velocity delta)
-  float  heading_motion;	// degrees
-  float  speed_motion;		// m/s
-  float3vector relPosNED;	//
-  float relPosHeading;
-  float speed_acc;		// speed accuracy m/s
-  double latitude;		//!< degrees
-  double longitude;		//!< degrees
-
-  uint8_t year;
-  uint8_t month;
-  uint8_t day;
-  uint8_t hour;
-
-  uint8_t minute;
-  uint8_t second;
-  uint8_t SATS_number;	//!< number of tracked satellites
-  uint8_t sat_fix_type;	//!< bit 0: SAT FIX, bit 1: SAT HEADING availale
-
-#if INCLUDING_NANO
-  int32_t nano;		// nanoseconds from time stamp
-#endif
-
-  int16_t geo_sep_dm; 	//!< (WGS ellipsoid height - elevation MSL) in 0.1m units
-  uint16_t dummy;
-} old_coordinates_t;
-
-typedef struct
-{
-  old_measurement_data_t m;
-  old_coordinates_t c;
-} old_input_data_t;
-
-#pragma pack(pop)
-
-inline void new_format_from_old( measurement_data_t & out_m, coordinates_t & out_c, const old_input_data_t & in)
-{
-  out_m.acc = in.m.acc;
-  out_m.mag = in.m.mag;
-  out_m.gyro = in.m.gyro;
-
-  out_m.static_pressure = in.m.static_pressure;
-  out_m.static_sensor_temperature = in.m.static_sensor_temperature;
-  out_m.pitot_pressure = in.m.pitot_pressure;
-  out_m.supply_voltage = in.m.supply_voltage;
-
-  out_c.position = in.c.position;
-  out_c.velocity = in.c.velocity;
-  out_c.acceleration = in.c.acceleration;  	//!< NED / m/s^2 (from velocity delta)
-  out_c.heading_motion = in.c.heading_motion;	// degrees
-  out_c.speed_motion = in.c.speed_motion;
-  out_c.relPosNED = in.c.relPosNED;	//
-  out_c.relPosHeading = in.c.relPosHeading;
-  out_c.speed_acc = in.c.speed_acc;
-  out_c.latitude = in.c.latitude;
-  out_c.longitude = in.c.longitude;
-
-  out_c.year = in.c.year;
-  out_c.month = in.c.month;
-  out_c.day = in.c.day;
-  out_c.hour = in.c.hour;
-
-  out_c.minute = in.c.minute;
-  out_c.second = in.c.second;
-  out_c.geo_sep_dm = in.c.geo_sep_dm;
-
-  out_c.nano = 0xefffffff; // not used in old format
-
-  // patches
-  if( isnormal(out_c.relPosHeading))
-    out_c.sat_fix_type = 3; // D-GNSS available
-  else
-    out_c.sat_fix_type = 1;
-
-  out_c.SATS_number  = 55; // just a joke ...
-
-#if GNSS_VERTICAL_SPEED_INVERTED // for simulation with some old data
-  out_c.velocity[DOWN] *= -1.0f; // earlier we recorded the wrong sign
-#endif
-}
-
-#endif /* OLD_DATA_STRUCTURES_H_ */
diff --git a/NAV_Algorithms/organizer.h b/NAV_Algorithms/organizer.h
index 73efd95..a53ea20 100644
--- a/NAV_Algorithms/organizer.h
+++ b/NAV_Algorithms/organizer.h
@@ -30,6 +30,13 @@
 #include "navigator.h"
 #include "earth_induction_model.h"
 
+typedef struct
+{
+  float3vector acc_observed_left;
+  float3vector acc_observed_right;
+  float3vector acc_observed_level;
+} vector_average_collection_t;
+
 //! set of algorithms and data to be used by Larus flight sensor
 class organizer_t
 {
@@ -43,20 +50,55 @@ class organizer_t
 
   }
 
+  void update_sensor_orientation_data( const vector_average_collection_t & values)
+  {
+    // resolve sensor orientation using measurements
+    float3vector front_down_sensor_helper = values.acc_observed_right.vector_multiply( values.acc_observed_left);
+    float3vector u_right_sensor = front_down_sensor_helper.vector_multiply( values.acc_observed_level);
+    u_right_sensor.normalize();
+
+    float3vector u_down_sensor = values.acc_observed_level * -1.0f;
+    u_down_sensor.normalize();
+
+    float3vector u_front_sensor=u_right_sensor.vector_multiply(u_down_sensor);
+    u_front_sensor.normalize();
+
+    // calculate the new rotation matrix using our calibration data
+    sensor_mapping.e[0][0]=u_front_sensor[0];
+    sensor_mapping.e[0][1]=u_front_sensor[1];
+    sensor_mapping.e[0][2]=u_front_sensor[2];
+
+    sensor_mapping.e[1][0]=u_right_sensor[0];
+    sensor_mapping.e[1][1]=u_right_sensor[1];
+    sensor_mapping.e[1][2]=u_right_sensor[2];
+
+    sensor_mapping.e[2][0]=u_down_sensor[0];
+    sensor_mapping.e[2][1]=u_down_sensor[1];
+    sensor_mapping.e[2][2]=u_down_sensor[2];
+
+    quaternion<float> q;
+    q.from_rotation_matrix( sensor_mapping);
+    eulerangle<float> euler = q;
+
+    // make the change permanent
+    lock_EEPROM( false);
+    write_EEPROM_value( SENS_TILT_ROLL,  euler.roll);
+    write_EEPROM_value( SENS_TILT_PITCH, euler.pitch);
+    write_EEPROM_value( SENS_TILT_YAW,   euler.yaw);
+    lock_EEPROM( true);
+  }
+
   void initialize_before_measurement( void)
   {
     pitot_offset= configuration (PITOT_OFFSET);
     pitot_span 	= configuration (PITOT_SPAN);
     QNH_offset	= configuration (QNH_OFFSET);
 
-      {
-        quaternion<float> q;
-        q.from_euler (configuration (SENS_TILT_ROLL),
-  		      configuration (SENS_TILT_PITCH),
-  		      configuration (SENS_TILT_YAW));
-        q.get_rotation_matrix (sensor_mapping);
-      }
-
+    quaternion<float> q;
+    q.from_euler (configuration (SENS_TILT_ROLL),
+		  configuration (SENS_TILT_PITCH),
+		  configuration (SENS_TILT_YAW));
+    q.get_rotation_matrix (sensor_mapping);
   }
 
   void update_magnetic_induction_data( double latitude, double longitude)
@@ -104,10 +146,11 @@ class organizer_t
     return landing_detected;
   }
 
-  void set_attitude ( float roll, float nick, float present_heading)
+  void set_attitude ( float roll, float pitch, float present_heading)
   {
-    navigator.set_attitude ( roll, nick, present_heading);
+    navigator.set_attitude ( roll, pitch, present_heading);
   }
+
   void update_GNSS_data( const coordinates_t &coordinates)
   {
     navigator.update_GNSS_data(coordinates);
@@ -116,15 +159,9 @@ class organizer_t
   void update_every_10ms( output_data_t & output_data)
   {
     // rotate sensor coordinates into airframe coordinates
-#if USE_LOWCOST_IMU == 1
-    acc  = sensor_mapping * output_data.m.lowcost_acc;
-    mag  = sensor_mapping * output_data.m.lowcost_mag;
-    gyro = sensor_mapping * output_data.m.lowcost_gyro;
-#else
     acc  = sensor_mapping * output_data.m.acc;
     mag  = sensor_mapping * output_data.m.mag;
     gyro = sensor_mapping * output_data.m.gyro;
-#endif
 
 #if DEVELOPMENT_ADDITIONS
     output_data.body_acc  = acc;
diff --git a/NAV_Algorithms/persistent_data.cpp b/NAV_Algorithms/persistent_data.cpp
index 9bc7c34..ea40781 100644
--- a/NAV_Algorithms/persistent_data.cpp
+++ b/NAV_Algorithms/persistent_data.cpp
@@ -45,7 +45,7 @@ ROM persistent_data_t PERSISTENT_DATA[]=
 	{MAG_Z_OFF,	"Mag_Z_Off",		false,  0.0f, 0},	//! Induction sensor x offset signed / ( 10.0f / 32768 )
 	{MAG_Z_SCALE,	"Mag_Z_Scale",		false,  1.0f, 0},	//! Induction sensor x gain signed ( scale-factor = 1.0f + value / 32768 )
 	{MAG_STD_DEVIATION, "Mag_Calib_Err",	false,  1e-2f, 0},	//! Magnetic calibration STD deviation / ( 1 % / 65536 )
-	{MAG_AUTO_CALIB, "Mag_Auto_Calib",	false,  1.0f, 0},	//! Magnetic calibration adjusted automatically
+	{MAG_AUTO_CALIB, "Mag_Auto_Calib",	false,  2.0f, 0},	//! Magnetic calibration automatic { OFF=0, 2D=1, 3D=2 }
 
 	{VARIO_TC,	"Vario_TC",		false, 2.0f, 0}, 	//! Vario time constant unsigned s / ( 100.0f / 65536 )
 	{VARIO_INT_TC,	"Vario_Int_TC",		false, 30.0f, 0},	//! Vario integrator time constant unsigned s / ( 100.0f / 65536 )
diff --git a/NAV_Algorithms/setup_tester.cpp b/NAV_Algorithms/setup_tester.cpp
new file mode 100644
index 0000000..3f94c9c
--- /dev/null
+++ b/NAV_Algorithms/setup_tester.cpp
@@ -0,0 +1,90 @@
+#include "float3vector.h"
+#include "float3matrix.h"
+#include "quaternion.h"
+
+const float ldi[]={0.1f, +0.25f, -10.0f}; // left wing down
+const float rdi[]={0.1f, -0.15f, -10.0f}; // right wing down
+const float hi[] ={0.0f, 0.0f, -10.0f};   // horizontal
+
+const float test1[] ={ 1.0f, 0.0f, 0.0f};
+const float test2[] ={ 0.57735f, 0.57735f, -0.57735f};
+const float test3[] ={ 0.57735f, -0.57735f, 0.57735f};
+
+void setup_tester(void)
+{
+  // create measurements in aircraft body frame
+  float3vector left_down_body( ldi);
+  float3vector right_down_body( rdi);
+  float3vector horiz_body( hi);
+
+  // setup the sensor orientation to test
+  quaternion<float>sensor_q;
+//  sensor_q.from_euler( 45.0 * M_PI/180.0, 135.0 * M_PI/180.0, -30.0 * M_PI/180.0);
+//  sensor_q.from_euler( -135.0 * M_PI/180.0, 45.0 * M_PI/180.0, 150.0 * M_PI/180.0);
+//  sensor_q.from_euler( -10.0 * M_PI/180.0, 75.0 * M_PI/180.0, -45.0 * M_PI/180.0);
+  sensor_q.from_euler( 10.0 * M_PI/180.0, -75.0 * M_PI/180.0, +125.0 * M_PI/180.0);
+
+  float3matrix sensor_2_body_assumption;
+  sensor_q.get_rotation_matrix(sensor_2_body_assumption);
+
+  //check if we get the rotation angles using the matrix given
+  quaternion<float> qin;
+  qin.from_rotation_matrix(sensor_2_body_assumption);
+  eulerangle<float> euler_in = qin;
+  euler_in.roll  *= 180/M_PI;
+  euler_in.pitch *= 180/M_PI;
+  euler_in.yaw   *= 180/M_PI;
+
+  // map measurements into the sensor frame
+  float3vector left_down_sensor	= sensor_2_body_assumption.reverse_map( left_down_body);
+  float3vector right_down_sensor= sensor_2_body_assumption.reverse_map( right_down_body);
+  float3vector horiz_sensor 	= sensor_2_body_assumption.reverse_map( horiz_body);
+
+  // resolve sensor orientation using measurements
+  float3vector front_down_sensor_helper = right_down_sensor.vector_multiply(left_down_sensor);
+  float3vector u_right_sensor = front_down_sensor_helper.vector_multiply(horiz_sensor);
+  u_right_sensor.normalize();
+
+  float3vector u_down_sensor = horiz_sensor * -1.0f;
+  u_down_sensor.normalize();
+
+  float3vector u_front_sensor=u_right_sensor.vector_multiply(u_down_sensor);
+  u_front_sensor.normalize();
+
+  // calculate the rotation matrix using our calibration data
+  float3matrix sensor_2_body;
+  sensor_2_body.e[0][0]=u_front_sensor[0];
+  sensor_2_body.e[0][1]=u_front_sensor[1];
+  sensor_2_body.e[0][2]=u_front_sensor[2];
+
+  sensor_2_body.e[1][0]=u_right_sensor[0];
+  sensor_2_body.e[1][1]=u_right_sensor[1];
+  sensor_2_body.e[1][2]=u_right_sensor[2];
+
+  sensor_2_body.e[2][0]=u_down_sensor[0];
+  sensor_2_body.e[2][1]=u_down_sensor[1];
+  sensor_2_body.e[2][2]=u_down_sensor[2];
+
+  // test if the rotation matrix recovers the body frame data
+  float3vector test_left_down_body = sensor_2_body * left_down_sensor;
+  float3vector test_right_down_body = sensor_2_body * right_down_sensor;
+  float3vector test_horiz_body = sensor_2_body * horiz_sensor;
+
+  // test if the matrix provides pure rotation
+  float3vector vtest1 = sensor_2_body * float3vector( test1);
+  float test1_abs=vtest1.abs();
+  float3vector vtest2 = sensor_2_body * float3vector( test2);
+  float test2_abs=vtest2.abs();
+  float3vector vtest3 = sensor_2_body * float3vector( test3);
+  float test3_abs=vtest3.abs();
+
+  //check if we get the rotation angles using the matrix we have calculated
+  quaternion<float> q;
+  q.from_rotation_matrix(sensor_2_body);
+  eulerangle<float> euler = q;
+  euler.roll  *= 180/M_PI;
+  euler.pitch *= 180/M_PI;
+  euler.yaw   *= 180/M_PI;
+}
+
+
diff --git a/NAV_Algorithms/soaring_flight_averager.h b/NAV_Algorithms/soaring_flight_averager.h
index f912401..9aeddd4 100644
--- a/NAV_Algorithms/soaring_flight_averager.h
+++ b/NAV_Algorithms/soaring_flight_averager.h
@@ -88,7 +88,8 @@ template<class value_t, bool CLAMP_OUTPUT_FIRST_CIRCLE = false, bool SOFT_TAKEOF
 	  if (new_state == STRAIGHT_FLIGHT)
 	    {
 	      active_state = STRAIGHT_FLIGHT;
-	      reset( get_boxcar_average());
+//	      reset( get_boxcar_average()); todo patch
+	      reset( present_output); // smooth transition
 	    }
 
 	  record_input (current_value, heading);
diff --git a/NAV_Algorithms/soft_iron_compensator.cpp b/NAV_Algorithms/soft_iron_compensator.cpp
new file mode 100644
index 0000000..d91a5ab
--- /dev/null
+++ b/NAV_Algorithms/soft_iron_compensator.cpp
@@ -0,0 +1,216 @@
+/***********************************************************************//**
+ * @file		compass_calibrator_3D.cpp
+ * @brief		induction sensor calibration and magnetic disturbance compensation
+ * @author		Dr. Klaus Schaefer
+ * @copyright 		Copyright 20.8.2024 Dr. Klaus Schaefer. All rights reserved.
+ * @license 		This project is released under the GNU Public License GPL-3.0
+
+    <Larus Flight Sensor Firmware>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+ **************************************************************************/
+
+#define PRINT_PARAMETERS 1
+#if PRINT_PARAMETERS
+#include "stdio.h"
+#endif
+
+#include "soft_iron_compensator.h"
+#include "embedded_math.h"
+
+#include <matrix_functions.h>
+
+ROM float RECIP_SECTOR_SIZE = soft_iron_compensator_t::OBSERVATIONS / M_PI_F / TWO / TWO;
+
+bool soft_iron_compensator_t::learn ( const float3vector &induction_error, const quaternion<float> &q, bool turning_right, float error_margin)
+{
+  float present_heading = q.get_heading();
+  if( present_heading <0.0f)
+    present_heading += M_PI_F * TWO;
+
+  int sector_index = (turning_right ? OBSERVATIONS / TWO : 0) + (unsigned)(present_heading * RECIP_SECTOR_SIZE);
+
+  // if we have just left the last sector to be collected: report ready for computation
+  if( ( last_sector_collected != -1) && ( sector_index != last_sector_collected) && (++measurement_counter > 10000))
+    return true;
+
+  if( heading_sector_error[sector_index] > 1e19) // this sector has not been written before
+    ++populated_sectors;
+
+  if ( heading_sector_error[sector_index] < error_margin)
+    return false; // we have collected more precise data for this observation earlier
+
+  heading_sector_error[sector_index] = error_margin;
+
+  for( unsigned axis = 0; axis < AXES; ++axis)
+    {
+      target_vector[axis][sector_index] = induction_error[axis];
+
+      observation_matrix[axis][sector_index][0] = 1.0f;
+      observation_matrix[axis][sector_index][1] = q[0] * q[1];
+      observation_matrix[axis][sector_index][2] = q[0] * q[2];
+      observation_matrix[axis][sector_index][3] = q[0] * q[3];
+      observation_matrix[axis][sector_index][4] = q[1] * q[1];
+      observation_matrix[axis][sector_index][5] = q[1] * q[2];
+      observation_matrix[axis][sector_index][6] = q[1] * q[3];
+      observation_matrix[axis][sector_index][7] = q[2] * q[2];
+      observation_matrix[axis][sector_index][8] = q[2] * q[3];
+      observation_matrix[axis][sector_index][9] = q[3] * q[3];
+    }
+
+  if( (last_sector_collected == -1) && populated_sectors >= OBSERVATIONS)
+    last_sector_collected = sector_index;
+
+  return false;
+}
+
+bool soft_iron_compensator_t::calculate( void)
+{
+//  if( buffer_used_for_calibration != INVALID)
+//    return false;
+
+  computation_float_type temporary_solution_matrix[PARAMETERS][PARAMETERS];
+  ARM_MATRIX_INSTANCE solution;
+  solution.numCols=PARAMETERS;
+  solution.numRows=PARAMETERS;
+  solution.pData = (computation_float_type *)temporary_solution_matrix;
+
+  ARM_MATRIX_INSTANCE  observations;
+  observations.numCols=PARAMETERS;
+  observations.numRows=OBSERVATIONS;
+
+  computation_float_type transposed_matrix[PARAMETERS][OBSERVATIONS];
+  ARM_MATRIX_INSTANCE observations_transposed;
+  observations_transposed.numCols=OBSERVATIONS;
+  observations_transposed.numRows=PARAMETERS;
+  observations_transposed.pData = (computation_float_type *)transposed_matrix;
+
+  computation_float_type matrix_to_be_inverted_data[PARAMETERS][PARAMETERS];
+  ARM_MATRIX_INSTANCE matrix_to_be_inverted;
+  matrix_to_be_inverted.numCols=PARAMETERS;
+  matrix_to_be_inverted.numRows=PARAMETERS;
+  matrix_to_be_inverted.pData = (computation_float_type *)matrix_to_be_inverted_data;
+
+  computation_float_type solution_mapping_data[PARAMETERS][OBSERVATIONS];
+  ARM_MATRIX_INSTANCE solution_mapping;
+  solution_mapping.numCols=OBSERVATIONS;
+  solution_mapping.numRows=PARAMETERS;
+  solution_mapping.pData = (computation_float_type *)solution_mapping_data;
+
+  int next_buffer;
+  if( buffer_used_for_calibration == 0)
+    next_buffer = 1;
+  else
+    next_buffer = 0;
+
+  for( unsigned axis = 0; axis < AXES; ++axis)
+    {
+      observations.pData = &(observation_matrix[axis][0][0]);
+
+      // calculation, once per axis (FRONT, RIGHT, DOWN):
+      // target vector:        T = ideal induction values for all observations
+      // single measurement:   < 1 induction q0q1 q0q2 q0q3 q1q1 q1q2 q1q3 q2q2 q2q3 q3q3 > (single row)
+      // measurement matrix:   M = single measurement * # OBSERVATIONS
+      // solution matrix:      S = inverse( M_transposed * M) * M_transposed
+      // axis parameter set:   P = S * T
+
+      arm_status result = ARM_MAT_TRANS( &observations, &observations_transposed);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+
+      result = ARM_MAT_MULT( &observations_transposed, &observations, &matrix_to_be_inverted);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+
+      result = ARM_MAT_INVERSE( &matrix_to_be_inverted, &solution);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+
+      result = ARM_MAT_MULT( &solution, &observations_transposed, &solution_mapping);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+
+      ARM_MATRIX_INSTANCE target_vector_inst;
+      target_vector_inst.numCols=1;
+      target_vector_inst.numRows=OBSERVATIONS;
+      target_vector_inst.pData=&(target_vector[axis][0]);
+
+      ARM_MATRIX_INSTANCE axis_parameter_set;
+      axis_parameter_set.numCols=1;
+      axis_parameter_set.numRows=PARAMETERS;
+      axis_parameter_set.pData=&(c[next_buffer][axis][0]);
+      result = ARM_MAT_MULT( &solution_mapping, &target_vector_inst, &axis_parameter_set);
+      if( result != ARM_MATH_SUCCESS)
+	{
+	  start_learning(); // discard data
+	  return false;
+	}
+      if( buffer_used_for_calibration != INVALID) // if we already had a valid parameter set
+	{
+	  int other_buffer = next_buffer == 1 ? 0 : 1;
+	  for( unsigned i=0; i<PARAMETERS; ++i)
+	    // use average between new and old parameter set
+	    c[next_buffer][axis][i] = c[next_buffer][axis][i] * 0.25 + c[other_buffer][axis][i] * 0.75;
+	}
+    }
+
+  buffer_used_for_calibration = next_buffer; // switch now in a thread-save manner
+
+#if PRINT_PARAMETERS
+
+  for( unsigned k=0; k < AXES; ++k)
+    {
+      for( unsigned i=0; i<PARAMETERS; ++i)
+	printf("%e\t", (double)(c[next_buffer][k][i]));
+      printf("\n");
+    }
+  printf("\n");
+#endif
+
+  start_learning(); // ... again
+
+  return true;
+}
+
+float3vector soft_iron_compensator_t::compensate( const float3vector &induction, const quaternion<float> &q)
+  {
+    if( buffer_used_for_calibration == INVALID) // we do not have a valid calibration
+      return float3vector();
+
+    unsigned b=buffer_used_for_calibration; // just to save expensive characters ...
+
+    float3vector retv;
+    for( int i = 0; i < 3; ++i)
+      {
+	retv[i] =
+	    c[b][i][0] +
+	    c[b][i][1] * q[0] * q[1] + c[b][i][2] * q[0] * q[2] + c[b][i][3] * q[0] * q[3] +
+	    c[b][i][4] * q[1] * q[1] + c[b][i][5] * q[1] * q[2] + c[b][i][6] * q[1] * q[3] +
+	    c[b][i][7] * q[2] * q[2] + c[b][i][8] * q[2] * q[3] + c[b][i][9] * q[3] * q[3] ;
+      }
+    return retv;
+  }
diff --git a/NAV_Algorithms/soft_iron_compensator.h b/NAV_Algorithms/soft_iron_compensator.h
new file mode 100644
index 0000000..328ebbf
--- /dev/null
+++ b/NAV_Algorithms/soft_iron_compensator.h
@@ -0,0 +1,116 @@
+/***********************************************************************//**
+ * @file		compass_calibrator_3D.h
+ * @brief		induction sensor calibration and magnetic disturbance compensation
+ * @author		Dr. Klaus Schaefer
+ * @copyright 		Copyright 20.8.2024 Dr. Klaus Schaefer. All rights reserved.
+ * @license 		This project is released under the GNU Public License GPL-3.0
+
+    <Larus Flight Sensor Firmware>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+ **************************************************************************/
+
+#ifndef NAV_ALGORITHMS_SOFT_IRON_COMPENSATOR_H_
+#define NAV_ALGORITHMS_SOFT_IRON_COMPENSATOR_H_
+
+#include "embedded_memory.h"
+#include "embedded_math.h"
+#include "float3vector.h"
+#include "quaternion.h"
+
+#include "matrix_functions.h"
+
+#if 0
+typedef double computation_float_type;
+#define ARM_MATRIX_INSTANCE arm_matrix_instance_f64
+#define ARM_MAT_TRANS arm_mat_trans_f64
+#define ARM_MAT_MULT arm_mat_mult_f64
+#define ARM_MAT_INVERSE arm_mat_inverse_f64
+#else
+typedef float computation_float_type;
+#define ARM_MATRIX_INSTANCE arm_matrix_instance_f32
+#define ARM_MAT_TRANS arm_mat_trans_f32
+#define ARM_MAT_MULT arm_mat_mult_f32
+#define ARM_MAT_INVERSE arm_mat_inverse_f32
+#endif
+
+//! 3 dimensional magnetic calibration and error compensation mechanism
+class soft_iron_compensator_t
+{
+public:
+  enum { AXES=3, PARAMETERS=10, OBSERVATIONS=24, INVALID=-1};
+
+  soft_iron_compensator_t( void)
+    : buffer_used_for_calibration(INVALID),
+      measurement_counter(0)
+  {
+    start_learning();
+  }
+
+  void start_learning( void)
+  {
+    populated_sectors = 0;
+    last_sector_collected=-1;
+    measurement_counter=0;
+    for( unsigned i=0; i<OBSERVATIONS; ++i)
+      heading_sector_error[i]=1e20f;
+  }
+
+  bool learn ( const float3vector &induction_error, const quaternion<float> &q, bool turning_right, float error_margin);
+float3vector compensate( const float3vector &induction, const quaternion<float> &q);
+  bool calculate( void);
+  bool available( void) const
+  {
+    return buffer_used_for_calibration != INVALID;
+  }
+
+  const computation_float_type * get_current_parameters( void) const
+  {
+    if( buffer_used_for_calibration == INVALID)
+      return 0;
+
+    return &(c[buffer_used_for_calibration][0][0]);
+  }
+
+  void set_current_parameters( const float * source)
+  {
+    int next_buffer;
+    if( buffer_used_for_calibration != 0)
+      next_buffer = 0;
+    else
+      next_buffer = 1;
+
+    computation_float_type * destination = &(c[next_buffer][0][0]);
+    for( unsigned i=0; i < AXES * PARAMETERS; ++i)
+      *destination++ = *source++;
+
+    buffer_used_for_calibration = next_buffer;
+  }
+
+private:
+  int buffer_used_for_calibration;
+  unsigned populated_sectors;
+  int last_sector_collected;
+  unsigned measurement_counter;
+  computation_float_type c[2][AXES][PARAMETERS]; // double buffering for multi-thrading support
+  computation_float_type target_vector[AXES][OBSERVATIONS];
+  computation_float_type observation_matrix[AXES][OBSERVATIONS][PARAMETERS];
+  computation_float_type heading_sector_error[OBSERVATIONS];
+};
+
+extern soft_iron_compensator_t soft_iron_compensator;
+void trigger_soft_iron_compensator_calculation(void);
+
+#endif /* NAV_ALGORITHMS_SOFT_IRON_COMPENSATOR_H_ */
diff --git a/NAV_Algorithms/variometer.cpp b/NAV_Algorithms/variometer.cpp
index bec1e47..a10caee 100644
--- a/NAV_Algorithms/variometer.cpp
+++ b/NAV_Algorithms/variometer.cpp
@@ -41,8 +41,12 @@ void variometer_t::update_at_100Hz (
     bool GNSS_fix_avaliable
   )
 {
+#if USE_OLD_FASHIONED_PRESSURE_VARIO
+  vario_uncompensated_pressure = pressure_vario_differentiator.respond( pressure_altitude);
+#else
   vario_uncompensated_pressure = KalmanVario_pressure.update (
       pressure_altitude, ahrs_acceleration[DOWN]);
+#endif
   speed_compensation_IAS = kinetic_energy_differentiator.respond (
       IAS * IAS * ONE_DIV_BY_GRAVITY_TIMES_2);
   vario_averager_pressure.respond (
@@ -90,8 +94,23 @@ void variometer_t::update_at_100Hz (
       // speed compensation type 3 comes from the derivative of the specific energy
       speed_compensation_energy_3 = specific_energy_differentiator.respond ( specific_energy);
 
+      // speed-compensation type 4 is the product of acceleration and velocity, both calculated along the heading axis
+      float3vector kalman_air_velocity;
+      kalman_air_velocity[NORTH] = Kalman_v_a_observer_N.get_x ( Kalman_V_A_Aoff_observer_t::VELOCITY);
+      kalman_air_velocity[EAST]  = Kalman_v_a_observer_E.get_x ( Kalman_V_A_Aoff_observer_t::VELOCITY);
+      kalman_air_velocity[DOWN]  = KalmanVario_GNSS.get_x (  	   KalmanVario_PVA_t::VARIO);
+      float air_velocity_projected = kalman_air_velocity * heading_vector;
+
+      acceleration[NORTH] = Kalman_v_a_observer_N.get_x ( Kalman_V_A_Aoff_observer_t::ACCELERATION);
+      acceleration[EAST]  = Kalman_v_a_observer_E.get_x ( Kalman_V_A_Aoff_observer_t::ACCELERATION);
+      acceleration[DOWN]  = KalmanVario_GNSS.get_x (	    KalmanVario_PVA_t::ACCELERATION_OBSERVED);
+      float acceleration_projected = acceleration * heading_vector;
+
+      speed_compensation_projected_4 = air_velocity_projected * acceleration_projected * RECIP_GRAVITY;
+
       // blending of three mechanisms for speed-compensation
-      speed_compensation_GNSS = GNSS_INS_speedcomp_fusioner.respond( 0.5f * (speed_compensation_INS_GNSS_1 + speed_compensation_kalman_2), speed_compensation_energy_3);
+//      speed_compensation_GNSS = GNSS_INS_speedcomp_fusioner.respond( 0.3333333f * (speed_compensation_INS_GNSS_1 + speed_compensation_kalman_2 + speed_compensation_projected_4), speed_compensation_energy_3);
+      speed_compensation_GNSS = GNSS_INS_speedcomp_fusioner.respond( 0.5 * (speed_compensation_INS_GNSS_1 + speed_compensation_kalman_2), speed_compensation_energy_3);
       vario_averager_GNSS.respond ( vario_uncompensated_GNSS + speed_compensation_GNSS);
     }
 }
diff --git a/NAV_Algorithms/variometer.h b/NAV_Algorithms/variometer.h
index f1ffcfd..6938a20 100644
--- a/NAV_Algorithms/variometer.h
+++ b/NAV_Algorithms/variometer.h
@@ -56,6 +56,9 @@ class variometer_t
     KalmanVario_GNSS( 0.0f, 0.0f, 0.0f, - GRAVITY),
     KalmanVario_pressure( 0.0f, 0.0f, 0.0f, - GRAVITY),
     specific_energy_differentiator( 1.0f, FAST_SAMPLING_TIME),
+#if USE_OLD_FASHIONED_PRESSURE_VARIO
+    pressure_vario_differentiator( 1.0f, FAST_SAMPLING_TIME),
+#endif
     Kalman_v_a_observer_N(),
     Kalman_v_a_observer_E(),
     GNSS_INS_speedcomp_fusioner(SPEED_COMPENSATION_FUSIONER_FEEDBACK),
@@ -66,7 +69,8 @@ class variometer_t
     specific_energy(0.0f),
     speed_compensation_INS_GNSS_1(0.0f),
     speed_compensation_kalman_2(0.0f),
-    speed_compensation_energy_3(0.0f)
+    speed_compensation_energy_3(0.0f),
+    speed_compensation_projected_4(0.0f)
   {
   };
     void update_at_100Hz
@@ -97,9 +101,12 @@ class variometer_t
 	return speed_compensation_kalman_2;
 	break;
 
-      default:
+      case 2:
 	return speed_compensation_energy_3;
 	break;
+      default:
+	return speed_compensation_projected_4;
+	break;
       }
   }
 
@@ -147,6 +154,9 @@ class variometer_t
 	KalmanVario_PVA_t KalmanVario_GNSS;
 	KalmanVario_t KalmanVario_pressure;
 	differentiator<float,float>specific_energy_differentiator;
+#if USE_OLD_FASHIONED_PRESSURE_VARIO
+	differentiator<float,float>pressure_vario_differentiator;
+#endif
 	Kalman_V_A_Aoff_observer_t Kalman_v_a_observer_N;
 	Kalman_V_A_Aoff_observer_t Kalman_v_a_observer_E;
 	HP_LP_fusion <float, float> GNSS_INS_speedcomp_fusioner;
@@ -161,6 +171,7 @@ class variometer_t
 	float speed_compensation_INS_GNSS_1;
 	float speed_compensation_kalman_2;
 	float speed_compensation_energy_3;
+	float speed_compensation_projected_4;
 };
 
 #endif /* VARIOMETER_H_ */
diff --git a/NAV_Algorithms/wind_observer.h b/NAV_Algorithms/wind_observer.h
index 308cb94..0ebe74e 100644
--- a/NAV_Algorithms/wind_observer.h
+++ b/NAV_Algorithms/wind_observer.h
@@ -31,6 +31,7 @@
 #include "accumulating_averager.h"
 #include "persistent_data.h"
 
+//! mechanisms to filter wind data
 class wind_oberserver_t
 {
 public:
@@ -95,7 +96,7 @@ class wind_oberserver_t
     else
       wind_average_observer.update(
 	  instant_wind,
-	  ahrs.get_euler ().y,
+	  ahrs.get_euler ().yaw,
 	  circling_state);
 
     float3vector relative_wind_NAV  = wind_resampler_100_10Hz.get_output() - wind_average_observer.get_output();
@@ -109,11 +110,12 @@ class wind_oberserver_t
         if( old_circling_state == TRANSITION) // when starting to circle
   	{
   	  circling_wind_averager.reset( wind_average_observer.get_output(), 100);
+  	  corrected_wind_averager.settle(wind_average_observer.get_output());
   	  relative_wind_observer.reset({0});
   	}
         else
           {
-	    relative_wind_observer.update( relative_wind_BODY, ahrs.get_euler ().y, ahrs.get_circling_state ());
+	    relative_wind_observer.update( relative_wind_BODY, ahrs.get_euler ().yaw, ahrs.get_circling_state ());
 	    wind_correction_nav = ahrs.get_body2nav() * relative_wind_observer.get_output();
 	    wind_correction_nav[DOWN]=0.0f;
 
@@ -155,7 +157,7 @@ class wind_oberserver_t
   pt2<float3vector,float> wind_resampler_100_10Hz;
   pt2<float3vector,float> instant_wind_averager;
   soaring_flight_averager< float3vector, true> wind_average_observer; // configure wind average clamping on first circle
-  soaring_flight_averager< float3vector, false, false> relative_wind_observer;
+  soaring_flight_averager< float3vector, true, true> relative_wind_observer;
   pt2<float3vector,float> corrected_wind_averager;
   accumulating_averager < float3vector> circling_wind_averager;
   circle_state_t circling_state;
diff --git a/Output_Formatter/CAN_output.cpp b/Output_Formatter/CAN_output.cpp
index 747d21b..2fbdab8 100644
--- a/Output_Formatter/CAN_output.cpp
+++ b/Output_Formatter/CAN_output.cpp
@@ -29,13 +29,15 @@
 #include "system_state.h"
 #include "CAN_gateway.h"
 
+extern uint32_t UNIQUE_ID[4]; // MPU silicon ID
+
 #define DEGREE_2_RAD 1.7453292e-2f
 
-#ifdef CAN_FORMAT_2021
+#if CAN_FORMAT_2021
 
 enum CAN_ID_SENSOR
 {
-  c_CAN_Id_EulerAngles	= 0x101,    //!< int16_t roll nick yaw / 1/1000 rad
+  c_CAN_Id_EulerAngles	= 0x101,    //!< int16_t roll pitch yaw / 1/1000 rad
   c_CAN_Id_Airspeed     = 0x102,    //!< int16_t TAS, IAS / km/h
   c_CAN_Id_Vario        = 0x103,    //!< int16_t vario, vario-integrator / mm/s
   c_CAN_Id_GPS_Date_Time= 0x104,    //!< uint8_t year-2000, month, day, hour, mins, secs
@@ -46,7 +48,7 @@ enum CAN_ID_SENSOR
   c_CAN_Id_Atmosphere   = 0x109,    //!< uint32_t pressure / Pa uint32_t density / g/m^3
   c_CAN_Id_GPS_Sats     = 0x10a,    //!< uin8_t No of Sats, uint8_t Fix-Type NO=0 2D=1 3D=2 RTK=3
   c_CAN_Id_Acceleration = 0x10b,    //!< int16_t G-force mm / s^2, int16_t vertical acc mm / m^2, vario uncomp mm / s, u_int8_t circle mode
-  c_CAN_Id_TurnCoord    = 0x10c,    //!< slip angle int16_t 1/1000 rad, turn rate int16_t 1/1000 rad/s, nick angle 1/1000 rad
+  c_CAN_Id_TurnCoord    = 0x10c,    //!< slip angle int16_t 1/1000 rad, turn rate int16_t 1/1000 rad/s, pitch angle 1/1000 rad
   c_CAN_Id_SystemState  = 0x10d,    //!< u32 system_state, u32 git_tag dec
   c_CID_KSB_Vdd         = 0x112,    //!< unit16_t as voltage * 10
 };
@@ -59,9 +61,9 @@ void CAN_output ( const output_data_t &x, bool horizon_activated)
     {
       p.id=c_CAN_Id_EulerAngles;		// 0x101
       p.dlc=6;
-      p.data_sh[0] = (int16_t)(round(x.euler.r * 1000.0f)); 	// unit = 1/1000 RAD
-      p.data_sh[1] = (int16_t)(round(x.euler.p * 1000.0f));
-      p.data_sh[2] = (int16_t)(round(x.euler.y * 1000.0f));
+      p.data_sh[0] = (int16_t)(round(x.euler.roll * 1000.0f)); 	// unit = 1/1000 RAD
+      p.data_sh[1] = (int16_t)(round(x.euler.pitch * 1000.0f));
+      p.data_sh[2] = (int16_t)(round(x.euler.yaw * 1000.0f));
       CAN_send(p, 1);
     }
   else
@@ -70,7 +72,7 @@ void CAN_output ( const output_data_t &x, bool horizon_activated)
       p.dlc=6;
       p.data_sh[0] = ZERO;
       p.data_sh[1] = ZERO;
-      p.data_sh[2] = (int16_t)(round(x.euler.y * 1000.0f));
+      p.data_sh[2] = (int16_t)(round(x.euler.yaw * 1000.0f));
       CAN_send(p, 1);
     }
 
@@ -160,7 +162,7 @@ void CAN_output ( const output_data_t &x, bool horizon_activated)
   p.dlc=6;
   p.data_sh[0] = (int16_t)(round(x.slip_angle * 1000.0f));	// slip angle in radiant from body acceleration
   p.data_sh[1] = (int16_t)(round(x.turn_rate  * 1000.0f)); 	// turn rate rad/s
-  p.data_sh[2] = (int16_t)(round(x.pitch_angle * 1000.0f));	// nick angle in radiant from body acceleration
+  p.data_sh[2] = (int16_t)(round(x.pitch_angle * 1000.0f));	// pitch angle in radiant from body acceleration
   if( CAN_send(p, 1)) // check CAN for timeout this time
     system_state |= CAN_OUTPUT_ACTIVE;
   else
@@ -186,98 +188,106 @@ enum CAN_ID_SENSOR
 {
   // all values in SI-STD- (metric) units, angles in radians
   // format IEEE float32 little-endian
-  CAN_Id_Roll_Nick	= 0x400,    //!< float roll-angle, float nick-angle (FRONT-RIGHT-DOWN-system)
-  CAN_Id_Heading	= 0x401,    //!< float true heading, [OPTIONAL float magnetic declination]
-  CAN_Id_Airspeed	= 0x402,    //!< float TAS, float IAS / m/s
-  CAN_Id_Vario		= 0x403,    //!< float vario, float vario-average / m/s
-
-  CAN_Id_GPS_Date_Time	= 0x404,    //!< uint8_t year-2000, month, day, hour, mins, secs
-  CAN_Id_GPS_LatLon	= 0x405,    //!< float latitude, float longitude / degrees(!)
-  CAN_Id_GPS_Alt	= 0x406,    //!< float MSL altitude,float geo separation
-  CAN_Id_GPS_Trk_Spd	= 0x407,    //!< float ground track, float groundspeed / m/s
-  CAN_Id_GPS_Sats	= 0x408,    //!< uin8_t No of Sats, (uint8_t)bool SAT FIX valid, (uint8_t)bool SAT HEADING available
-
-  CAN_Id_Wind		= 0x409,    //!< float wind direction (where from), float wind speed
-  CAN_Id_Wind_Average	= 0x40a,    //!< float average wind direction, float average wind speed m/s
-  CAN_Id_Atmosphere	= 0x40b,    //!< float ambient pressure / Pa, float air density / kg/m^3
-  CAN_Id_Acceleration	= 0x40c,    //!< float body-frame G-load m/s^2, body acceleration angle roll-axis
-  CAN_Id_TurnRate	= 0x40d,    //!< float turn rate to the right, (uint8_t) (enum  { STRAIGHT_FLIGHT, TRANSITION, CIRCLING} )
-  CAN_Id_SystemState	= 0x40e,    //!< u32 system_state, u32 git_tag_dec
-  CAN_Id_Voltage	= 0x40f,    //!< float supply voltage
+  CAN_Id_Roll_Pitch	= 0x120,    //!< float roll-angle, float pitch-angle (FRONT-RIGHT-DOWN-system)
+  CAN_Id_Heading	= 0x121,    //!< float true heading, turn-rate
+  CAN_Id_Airspeed	= 0x122,    //!< float TAS, float IAS / m/s
+  CAN_Id_Vario		= 0x123,    //!< float vario, float vario-average / m/s
+  CAN_Id_Wind		= 0x124,    //!< float wind direction (where from), float wind speed
+  CAN_Id_Wind_Average	= 0x125,    //!< float average wind direction, float average wind speed m/s
+  CAN_Id_Atmosphere	= 0x126,    //!< float ambient pressure / Pa, float air density / kg/m^3
+  CAN_Id_Acceleration	= 0x127,    //!< float body-frame G-load m/s^2, vertical acceleration world frame
+  CAN_Id_SlipPitch	= 0x128,    //!< float slip angle from body-acc, float pitch angle from body-acc
+  CAN_Id_Voltage_Circle	= 0x129,    //!< float supply voltage, uint8_t circle-mode
+  CAN_Id_SystemState    = 0x12a,    //!< u32 system_state, u32 git_tag dec
+
+  CAN_Id_GPS_Date_Time	= 0x140,    //!< uint8_t year-2000, month, day, hour, mins, secs
+  CAN_Id_GPS_Lat	= 0x141,    //!< double latitude
+  CAN_Id_GPS_Lon	= 0x142,    //!< double longitude
+  CAN_Id_GPS_Alt	= 0x143,    //!< float MSL altitude, float geo separation
+  CAN_Id_GPS_Trk_Spd	= 0x144,    //!< float ground track, float groundspeed / m/s
+  CAN_Id_GPS_Sats	= 0x145,    //!< uin8_t No of Sats, (uint8_t)bool SAT FIX type
+
+  CAN_Id_Heartbeat_Sens	= 0x520,
+  CAN_Id_Heartbeat_GNSS	= 0x540,
+  CAN_Id_Heartbeat_IMU	= 0x560
+
 };
 
-void CAN_output ( const output_data_t &x)
+void CAN_output ( const output_data_t &x, bool horizon_activated)
 {
-  CANpacket p;
+  CANpacket p( 0x7ff, 8);
+  if( horizon_activated)
+    {
+      p.id=CAN_Id_Roll_Pitch;
+      p.dlc=8;
+      p.data_f[0] = x.euler.roll;
+      p.data_f[1] = x.euler.pitch;
+      CAN_send(p, 1);
+    }
 
-#if  HORIZON_DATA_SECRET == 0
-  p.id=CAN_Id_Roll_Nick;
-  p.dlc=8;
-  p.data_f[0] = x.euler.r;
-  p.data_f[1] = x.euler.n;
-  CAN_send(p, 1);
-#endif
   p.id=CAN_Id_Heading;
-  p.dlc=4;
-  p.data_f[0] = x.euler.y;
+  p.data_f[0] = x.euler.yaw;
+  p.data_f[1] = x.turn_rate;
   CAN_send(p, 1);
 
   p.id=CAN_Id_Airspeed;
-  p.dlc=8;
   p.data_f[0] = x.TAS;
   p.data_f[1] = x.IAS;
   CAN_send(p, 1);
 
   p.id=CAN_Id_Vario;
-  p.dlc=8;
   p.data_f[0] = x.vario;
   p.data_f[1] = x.integrator_vario;
   CAN_send(p, 1);
 
   p.id=CAN_Id_Wind;
-  p.dlc=8;
   p.data_f[0] = ATAN2( - x.wind[EAST], - x.wind[NORTH]);
   p.data_f[1] = x.wind.abs();
   CAN_send(p, 1);
 
   p.id=CAN_Id_Wind_Average;
-  p.dlc=8;
   p.data_f[0] = ATAN2( - x.wind_average[EAST], - x.wind_average[NORTH]);
-  p.data_f[1] = x.wind.abs();
+  p.data_f[1] = x.wind_average.abs();
   CAN_send(p, 1);
 
   p.id=CAN_Id_Atmosphere;
-  p.dlc=8;
   p.data_f[0] = x.m.static_pressure;
   p.data_f[1] = x.air_density;
   CAN_send(p, 1);
 
   p.id=CAN_Id_Acceleration;
-  p.dlc=7;
   p.data_f[0] = x.G_load;
-  p.data_f[1] = x.slip_angle;
+  p.data_f[1] = x.effective_vertical_acceleration;
   CAN_send(p, 1);
 
-  p.id=CAN_Id_TurnRate;
+  p.id=CAN_Id_SlipPitch;
+  p.data_f[0] = x.slip_angle; // pendulum readings
+  p.data_f[1] = x.pitch_angle;
+  CAN_send(p, 1);
+
+  p.id=CAN_Id_Voltage_Circle;
   p.dlc=5;
-  p.data_f[0] = x.turn_rate;
+  p.data_f[0] = x.m.supply_voltage;
   p.data_b[4] = (uint8_t)(x.circle_mode);
-
+  CAN_send(p, 1);
 
   p.id=CAN_Id_GPS_Date_Time;
-  p.dlc=6;
-  p.data_b[0] = x.c.year;
-  p.data_b[1] = x.c.month;
-  p.data_b[2] = x.c.day;
-  p.data_b[3] = x.c.hour;
-  p.data_b[4] = x.c.minute;
-  p.data_b[5] = x.c.second;
+  p.dlc=7;
+  p.data_h[0] = x.c.year + 2000; // GNSS reports only 2000 + x
+  p.data_b[2] = x.c.month;
+  p.data_b[3] = x.c.day;
+  p.data_b[4] = x.c.hour;
+  p.data_b[5] = x.c.minute;
+  p.data_b[6] = x.c.second;
   CAN_send(p, 1);
 
-  p.id=CAN_Id_GPS_LatLon;
+  p.id=CAN_Id_GPS_Lat;
   p.dlc=8;
-  p.data_f[0] = (float)(x.c.latitude); // -> 4m of accuracy
-  p.data_f[1] = (float)(x.c.longitude);
+  p.data_d = x.c.latitude;
+  CAN_send(p, 1);
+
+  p.id=CAN_Id_GPS_Lon;
+  p.data_d = x.c.longitude;
   CAN_send(p, 1);
 
   p.id=CAN_Id_GPS_Alt;		// 0x106
@@ -287,8 +297,7 @@ void CAN_output ( const output_data_t &x)
   CAN_send(p, 1);
 
   p.id=CAN_Id_GPS_Trk_Spd;
-  p.dlc=8;
-  p.data_f[0] = x.c.heading_motion * DEGREE_2_RAD;
+  p.data_f[0] = x.c.heading_motion;
   p.data_f[1] = x.c.speed_motion;
   CAN_send(p, 1);
 
@@ -296,21 +305,12 @@ void CAN_output ( const output_data_t &x)
   p.dlc=2;
   p.data_b[0] = x.c.SATS_number;
   p.data_b[1] = x.c.sat_fix_type;
-  CAN_send(p, 1);
-
-  p.id=CAN_Id_Voltage;
-  p.dlc=4;
-  p.data_f[0] = x.m.supply_voltage;
 
   if( CAN_send(p, 1)) // check CAN for timeout this time
     system_state |= CAN_OUTPUT_ACTIVE;
   else
     system_state &= ~CAN_OUTPUT_ACTIVE;
 
-#ifndef GIT_TAG_DEC
-#define GIT_TAG_DEC 0xffffffff
-#endif
-
   p.id=CAN_Id_SystemState;
   p.dlc=8;
   p.data_w[0] = system_state;
@@ -318,5 +318,20 @@ void CAN_output ( const output_data_t &x)
   CAN_send(p, 1);
 }
 
+void CAN_heartbeat( void)
+{
+  CANpacket p( CAN_Id_Heartbeat_Sens, 8);
+
+  p.data_h[0] = 2;
+  p.data_h[1] = 0;
+  p.data_w[1] = UNIQUE_ID[0];
+  CAN_send(p, 1);
+
+  p.id=CAN_Id_Heartbeat_GNSS;
+  p.data_h[0] = 3;
+  p.data_h[1] = 0;
+  p.data_w[1] = UNIQUE_ID[0];
+  CAN_send(p, 1);
+}
 
 #endif
diff --git a/Output_Formatter/CAN_output.h b/Output_Formatter/CAN_output.h
index 84b5587..a024c0f 100644
--- a/Output_Formatter/CAN_output.h
+++ b/Output_Formatter/CAN_output.h
@@ -36,7 +36,29 @@
 #define  SYSWIDECONFIG_ITEM_ID_PILOT_KG 	5
 #define  SYSWIDECONFIG_ITEM_ID_VARIO_MODE	6
 
+#define  SENS_CONFIG_ITEM_ROLL			0x2000
+#define  SENS_CONFIG_ITEM_PITCH			0x2001
+#define  SENS_CONFIG_ITEM_YAW			0x2002
+#define  SENS_CONFIG_ITEM_PIT_OFF		0x2003
+#define  SENS_CONFIG_ITEM_PIT_SPAN		0x2004
+#define  SENS_CONFIG_ITEM_QNH_OFF		0x2005
+#define  SENS_CONFIG_MAG_CALIB			0x2006
+#define  SENS_CONFIG_VARIO_TC			0x2007
+#define  SENS_CONFIG_VARIO_INT_TC		0x2008
+#define  SENS_CONFIG_WIND_TC			0x2009
+#define  SENS_CONFIG_WIND_MEAN_TC		0x200a
+#define  SENS_CONFIG_GNSS_CONFIG		0x200b
+#define  SENS_CONFIG_ANT_BASELEN		0x200c
+#define  SENS_CONFIG_ANT_SLAVE_DOWN		0x200d
+#define  SENS_CONFIG_ANT_SLAVE_RIGHT		0x200e
+
+#define  CMD_MEASURE_LEFT			0x3000
+#define  CMD_MEASURE_RIGHT			0x3001
+#define  CMD_MEASURE_LEVEL			0x3002
+#define  CMD_CALCULATE				0x3003
+#define  CMD_TUNE				0x3004
 
 void CAN_output ( const output_data_t &x, bool horizon_activated);
+void CAN_heartbeat( void);
 
 #endif /* SRC_CAN_OUTPUT_H_ */
diff --git a/Output_Formatter/NMEA_format.cpp b/Output_Formatter/NMEA_format.cpp
index 32c5d70..824d58b 100644
--- a/Output_Formatter/NMEA_format.cpp
+++ b/Output_Formatter/NMEA_format.cpp
@@ -439,9 +439,9 @@ void format_NMEA_string_fast( const output_data_t &output_data, string_buffer_t
 
   // aircraft attitude
   if( horizon_available)
-    format_PLARA(output_data.euler.r, output_data.euler.p, output_data.euler.y, next);
+    format_PLARA(output_data.euler.roll, output_data.euler.pitch, output_data.euler.yaw, next);
   else
-    format_PLARA( ZERO, ZERO, output_data.euler.y, next);
+    format_PLARA( ZERO, ZERO, output_data.euler.yaw, next);
 
   // report instant and average total-energy-compensated variometer, pressure altitude, TAS
   format_PLARV ( output_data.vario,
diff --git a/Output_Formatter/generic_CAN_driver.h b/Output_Formatter/generic_CAN_driver.h
index 3eb4385..65bce50 100644
--- a/Output_Formatter/generic_CAN_driver.h
+++ b/Output_Formatter/generic_CAN_driver.h
@@ -53,6 +53,7 @@ class CANpacket
 		uint32_t data_w[2]; 	//!< data seen as 2 times uint32_t
 		int32_t  data_sw[2];	//!< data seen as 2 times int32_t
 		float    data_f[2]; 	//!< data seen as 2 times 32-bit floats
+		double    data_d; 		//!< data seen as 64-bit float
 		uint64_t data_l;    	//!< data seen as 64-bit integer
   };
 } ;
diff --git a/README.md b/README.md
index cbb1951..5a0e10e 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,8 @@ The algorithms include:
 
 The sensor output as a set of NMEA sentences ist defined in our subproject [larus-NMEA-protocol](https://github.com/larus-breeze/doc_larus/blob/master/documentation/Larus_NMEA_Protocol.md)
 
+To get an idea about the software structure and the algorithms used you may want to browse the [Doxygen-generated documentation of the library.](https://schaefer.eit.h-da.de/Larus_SIL)
+
 # This library is designed to be imported into another project via a .gitmodules file.
 
 Add as submodule to repository: