1 files changed, 97 insertions, 242 deletions
diff --git a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
index 4ca3f85af..011e21dfe 100644
--- a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
+++ b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df1_q31.c
  * Description:  Processing function for the Q31 Biquad cascade filter
  *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
  *
  * Target Processor: Cortex-M cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -29,59 +29,54 @@
 #include "arm_math.h"
 
 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
  */
 
 /**
- * @addtogroup BiquadCascadeDF1
- * @{
+  @addtogroup BiquadCascadeDF1
+  @{
  */
 
 /**
- * @brief Processing function for the Q31 Biquad cascade filter.
- * @param[in]  *S         points to an instance of the Q31 Biquad cascade structure.
- * @param[in]  *pSrc      points to the block of input data.
- * @param[out] *pDst      points to the block of output data.
- * @param[in]  blockSize  number of samples to process per call.
- * @return none.
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
- * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
- * 1.31 format by discarding the low 32 bits.
- *
- * \par
- * Refer to the function <code>arm_biquad_cascade_df1_fast_q31()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+  @brief         Processing function for the Q31 Biquad cascade filter.
+  @param[in]     S         points to an instance of the Q31 Biquad cascade structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize  number of samples to process
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using an internal 64-bit accumulator.
+                   The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+                   Thus, if the accumulator result overflows it wraps around rather than clipping.
+                   In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
+                   After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to
+                   1.31 format by discarding the low 32 bits.
+  @remark
+                   Refer to \ref arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter.
  */
 
 void arm_biquad_cascade_df1_q31(
   const arm_biquad_casd_df1_inst_q31 * S,
-  q31_t * pSrc,
-  q31_t * pDst,
-  uint32_t blockSize)
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
 {
-  q63_t acc;                                     /*  accumulator                   */
-  uint32_t uShift = ((uint32_t) S->postShift + 1U);
-  uint32_t lShift = 32U - uShift;                /*  Shift to be applied to the output */
-  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
-  q31_t *pOut = pDst;                            /*  output pointer initialization */
-  q31_t *pState = S->pState;                     /*  pState pointer initialization */
-  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
-  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
-  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
-  q31_t Xn;                                      /*  temporary input               */
-  uint32_t sample, stage = S->numStages;         /*  loop counters                     */
-
-
-#if defined (ARM_MATH_DSP)
-
-  q31_t acc_l, acc_h;                            /*  temporary output variables    */
-
-  /* Run the below code for Cortex-M4 and Cortex-M3 */
+  const q31_t *pIn = pSrc;                             /* Source pointer */
+        q31_t *pOut = pDst;                            /* Destination pointer */
+        q31_t *pState = S->pState;                     /* pState pointer */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q63_t acc;                                     /* Accumulator */
+        q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
+        q31_t Xn1, Xn2, Yn1, Yn2;                      /* Filter pState variables */
+        q31_t Xn;                                      /* Temporary input */
+        uint32_t uShift = ((uint32_t) S->postShift + 1U);
+        uint32_t lShift = 32U - uShift;                /* Shift to be applied to the output */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t acc_l, acc_h;                            /* temporary output variables */
+#endif
 
   do
   {
@@ -92,301 +87,161 @@ void arm_biquad_cascade_df1_q31(
     a1 = *pCoeffs++;
     a2 = *pCoeffs++;
 
-    /* Reading the state values */
+    /* Read this stage's state variables from pState */
     Xn1 = pState[0];
     Xn2 = pState[1];
     Yn1 = pState[2];
     Yn2 = pState[3];
 
+#if defined (ARM_MATH_LOOPUNROLL)
+
     /* Apply loop unrolling and compute 4 output values simultaneously. */
-    /*      The variable acc hold output values that are being computed:
+    /* Variable acc holds the output value that is being computed:
      *
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+     * acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
      */
 
+    /* Loop unrolling: Compute 4 outputs at a time */
     sample = blockSize >> 2U;
 
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-     ** a second loop below computes the remaining 1 to 3 samples. */
     while (sample > 0U)
     {
-      /* Read the input */
+      /* Read the first input */
       Xn = *pIn++;
 
       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn2;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn1;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn2;
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
 
       /* The result is converted to 1.31 , Yn2 variable is reused */
-
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
 
       /* Apply shift for lower part of acc and upper part of acc */
       Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
 
-      /* Store the output in the destination buffer. */
+      /* Store output in destination buffer. */
       *pOut++ = Yn2;
 
       /* Read the second input */
       Xn2 = *pIn++;
 
       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn2;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn1;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn2;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn1;
-
+      acc = ((q63_t) b0 * Xn2) + ((q63_t) b1 * Xn) + ((q63_t) b2 * Xn1) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
 
       /* The result is converted to 1.31, Yn1 variable is reused  */
-
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
-
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
 
       /* Apply shift for lower part of acc and upper part of acc */
       Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
 
-      /* Store the output in the destination buffer. */
+      /* Store output in destination buffer. */
       *pOut++ = Yn1;
 
-      /* Read the third input  */
+      /* Read the third input */
       Xn1 = *pIn++;
 
       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn1;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn2;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn1;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn2;
+      acc = ((q63_t) b0 * Xn1) + ((q63_t) b1 * Xn2) + ((q63_t) b2 * Xn) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
 
       /* The result is converted to 1.31, Yn2 variable is reused  */
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
-
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
 
       /* Apply shift for lower part of acc and upper part of acc */
       Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
 
-      /* Store the output in the destination buffer. */
+      /* Store output in destination buffer. */
       *pOut++ = Yn2;
 
       /* Read the forth input */
       Xn = *pIn++;
 
       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn2;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn2;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn1;
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn2) + ((q63_t) a2 * Yn1);
 
       /* The result is converted to 1.31, Yn1 variable is reused  */
-      /* Calc lower part of acc */
-      acc_l = acc & 0xffffffff;
-
-      /* Calc upper part of acc */
-      acc_h = (acc >> 32) & 0xffffffff;
+      acc_l = (acc      ) & 0xffffffff; /* Calc lower part of acc */
+      acc_h = (acc >> 32) & 0xffffffff; /* Calc upper part of acc */
 
       /* Apply shift for lower part of acc and upper part of acc */
       Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
 
-      /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
-      Xn2 = Xn1;
-      Xn1 = Xn;
-
-      /* Store the output in the destination buffer. */
+      /* Store output in destination buffer. */
       *pOut++ = Yn1;
 
-      /* decrement the loop counter */
-      sample--;
-    }
-
-    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
-     ** No loop unrolling is used. */
-    sample = (blockSize & 0x3U);
-
-    while (sample > 0U)
-    {
-      /* Read the input */
-      Xn = *pIn++;
-
-      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn;
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn2;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn1;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn2;
-
-      /* The result is converted to 1.31  */
-      acc = acc >> lShift;
-
       /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
       Xn2 = Xn1;
       Xn1 = Xn;
-      Yn2 = Yn1;
-      Yn1 = (q31_t) acc;
 
-      /* Store the output in the destination buffer. */
-      *pOut++ = (q31_t) acc;
-
-      /* decrement the loop counter */
+      /* decrement loop counter */
       sample--;
     }
 
-    /*  The first stage goes from the input buffer to the output buffer. */
-    /*  Subsequent stages occur in-place in the output buffer */
-    pIn = pDst;
-
-    /* Reset to destination pointer */
-    pOut = pDst;
-
-    /*  Store the updated state variables back into the pState array */
-    *pState++ = Xn1;
-    *pState++ = Xn2;
-    *pState++ = Yn1;
-    *pState++ = Yn2;
-
-  } while (--stage);
+    /* Loop unrolling: Compute remaining outputs */
+    sample = blockSize & 0x3U;
 
 #else
 
-  /* Run the below code for Cortex-M0 */
-
-  do
-  {
-    /* Reading the coefficients */
-    b0 = *pCoeffs++;
-    b1 = *pCoeffs++;
-    b2 = *pCoeffs++;
-    a1 = *pCoeffs++;
-    a2 = *pCoeffs++;
-
-    /* Reading the state values */
-    Xn1 = pState[0];
-    Xn2 = pState[1];
-    Yn1 = pState[2];
-    Yn2 = pState[3];
-
-    /*      The variables acc holds the output value that is computed:
-     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
-     */
-
+    /* Initialize sample counter with the number of samples */
     sample = blockSize;
 
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
     while (sample > 0U)
     {
       /* Read the input */
       Xn = *pIn++;
 
       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
-      /* acc =  b0 * x[n] */
-      acc = (q63_t) b0 *Xn;
-
-      /* acc +=  b1 * x[n-1] */
-      acc += (q63_t) b1 *Xn1;
-      /* acc +=  b[2] * x[n-2] */
-      acc += (q63_t) b2 *Xn2;
-      /* acc +=  a1 * y[n-1] */
-      acc += (q63_t) a1 *Yn1;
-      /* acc +=  a2 * y[n-2] */
-      acc += (q63_t) a2 *Yn2;
+      acc = ((q63_t) b0 * Xn) + ((q63_t) b1 * Xn1) + ((q63_t) b2 * Xn2) + ((q63_t) a1 * Yn1) + ((q63_t) a2 * Yn2);
 
       /* The result is converted to 1.31  */
       acc = acc >> lShift;
 
+      /* Store output in destination buffer. */
+      *pOut++ = (q31_t) acc;
+
       /* Every time after the output is computed state should be updated. */
-      /* The states should be updated as:  */
-      /* Xn2 = Xn1    */
-      /* Xn1 = Xn     */
-      /* Yn2 = Yn1    */
-      /* Yn1 = acc    */
+      /* The states should be updated as: */
+      /* Xn2 = Xn1 */
+      /* Xn1 = Xn  */
+      /* Yn2 = Yn1 */
+      /* Yn1 = acc */
       Xn2 = Xn1;
       Xn1 = Xn;
       Yn2 = Yn1;
       Yn1 = (q31_t) acc;
 
-      /* Store the output in the destination buffer. */
-      *pOut++ = (q31_t) acc;
-
-      /* decrement the loop counter */
+      /* decrement loop counter */
       sample--;
     }
 
-    /*  The first stage goes from the input buffer to the output buffer. */
-    /*  Subsequent stages occur in-place in the output buffer */
-    pIn = pDst;
-
-    /* Reset to destination pointer */
-    pOut = pDst;
-
-    /*  Store the updated state variables back into the pState array */
+    /* Store the updated state variables back into the pState array */
     *pState++ = Xn1;
     *pState++ = Xn2;
     *pState++ = Yn1;
     *pState++ = Yn2;
 
-  } while (--stage);
+    /* The first stage goes from the input buffer to the output buffer. */
+    /* Subsequent stages operate in-place on the output buffer */
+    pIn = pDst;
 
-#endif /*  #if defined (ARM_MATH_DSP) */
-}
+    /* Reset output pointer */
+    pOut = pDst;
 
+    /* decrement loop counter */
+    stage--;
 
+  } while (stage > 0U);
 
+}
 
 /**
-  * @} end of BiquadCascadeDF1 group
-  */
+  @} end of BiquadCascadeDF1 group
+ */