1 files changed, 293 insertions, 440 deletions
diff --git a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
index aad9fbb5e..a8af8cec5 100644
--- a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
+++ b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
@@ -3,13 +3,13 @@
  * Title:        arm_biquad_cascade_df2T_f64.c
  * Description:  Processing function for floating-point transposed direct form II Biquad cascade filter
  *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
  *
  * Target Processor: Cortex-M cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -29,139 +29,128 @@
 #include "arm_math.h"
 
 /**
-* @ingroup groupFilters
+  @ingroup groupFilters
 */
 
 /**
-* @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
-*
-* This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
-* The filters are implemented as a cascade of second order Biquad sections.
-* These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
-* Only floating-point data is supported.
-*
-* This function operate on blocks of input and output data and each call to the function
-* processes <code>blockSize</code> samples through the filter.
-* <code>pSrc</code> points to the array of input data and
-* <code>pDst</code> points to the array of output data.
-* Both arrays contain <code>blockSize</code> values.
-*
-* \par Algorithm
-* Each Biquad stage implements a second order filter using the difference equation:
-* <pre>
-*    y[n] = b0 * x[n] + d1
-*    d1 = b1 * x[n] + a1 * y[n] + d2
-*    d2 = b2 * x[n] + a2 * y[n]
-* </pre>
-* where d1 and d2 represent the two state values.
-*
-* \par
-* A Biquad filter using a transposed Direct Form II structure is shown below.
-* \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
-* Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
-* Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
-* Pay careful attention to the sign of the feedback coefficients.
-* Some design tools flip the sign of the feedback coefficients:
-* <pre>
-*    y[n] = b0 * x[n] + d1;
-*    d1 = b1 * x[n] - a1 * y[n] + d2;
-*    d2 = b2 * x[n] - a2 * y[n];
-* </pre>
-* In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
-*
-* \par
-* Higher order filters are realized as a cascade of second order sections.
-* <code>numStages</code> refers to the number of second order stages used.
-* For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
-* A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
-* coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
-*
-* \par
-* <code>pState</code> points to the state variable array.
-* Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
-* The state variables are arranged in the <code>pState</code> array as:
-* <pre>
-*     {d11, d12, d21, d22, ...}
-* </pre>
-* where <code>d1x</code> refers to the state variables for the first Biquad and
-* <code>d2x</code> refers to the state variables for the second Biquad.
-* The state array has a total length of <code>2*numStages</code> values.
-* The state variables are updated after each block of data is processed; the coefficients are untouched.
-*
-* \par
-* The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
-* The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
-* That is why the Direct Form I structure supports Q15 and Q31 data types.
-* The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
-* Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
-* The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
-*
-* \par Instance Structure
-* The coefficients and state variables for a filter are stored together in an instance data structure.
-* A separate instance structure must be defined for each filter.
-* Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-*
-* \par Init Functions
-* There is also an associated initialization function.
-* The initialization function performs following operations:
-* - Sets the values of the internal structure fields.
-* - Zeros out the values in the state buffer.
-* To do this manually without calling the init function, assign the follow subfields of the instance structure:
-* numStages, pCoeffs, pState. Also set all of the values in pState to zero.
-*
-* \par
-* Use of the initialization function is optional.
-* However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
-* To place an instance structure into a const data section, the instance structure must be manually initialized.
-* Set the values in the state buffer to zeros before static initialization.
-* For example, to statically initialize the instance structure use
-* <pre>
-*     arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
-* </pre>
-* where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer.
-* <code>pCoeffs</code> is the address of the coefficient buffer;
-*
+  @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
+
+  This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
+  The filters are implemented as a cascade of second order Biquad sections.
+  These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
+  Only floating-point data is supported.
+
+  This function operate on blocks of input and output data and each call to the function
+  processes <code>blockSize</code> samples through the filter.
+  <code>pSrc</code> points to the array of input data and
+  <code>pDst</code> points to the array of output data.
+  Both arrays contain <code>blockSize</code> values.
+
+  @par           Algorithm
+                   Each Biquad stage implements a second order filter using the difference equation:
+  <pre>
+     y[n] = b0 * x[n] + d1
+     d1 = b1 * x[n] + a1 * y[n] + d2
+     d2 = b2 * x[n] + a2 * y[n]
+  </pre>
+                   where d1 and d2 represent the two state values.
+  @par
+                   A Biquad filter using a transposed Direct Form II structure is shown below.
+                   \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
+                   Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.
+                   Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.
+                   Pay careful attention to the sign of the feedback coefficients.
+                   Some design tools flip the sign of the feedback coefficients:
+  <pre>
+     y[n] = b0 * x[n] + d1;
+     d1 = b1 * x[n] - a1 * y[n] + d2;
+     d2 = b2 * x[n] - a2 * y[n];
+  </pre>
+                   In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.
+  @par
+                   Higher order filters are realized as a cascade of second order sections.
+                   <code>numStages</code> refers to the number of second order stages used.
+                   For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.
+                   A 9th order filter would be realized with <code>numStages=5</code> second order stages with the
+                   coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).
+  @par
+                   <code>pState</code> points to the state variable array.
+                   Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
+                   The state variables are arranged in the <code>pState</code> array as:
+  <pre>
+      {d11, d12, d21, d22, ...}
+  </pre>
+                   where <code>d1x</code> refers to the state variables for the first Biquad and
+                   <code>d2x</code> refers to the state variables for the second Biquad.
+                   The state array has a total length of <code>2*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
+  @par
+                   The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
+                   The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
+                   That is why the Direct Form I structure supports Q15 and Q31 data types.
+                   The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.
+                   Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
+                   The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
+
+  @par           Instance Structure
+                   The coefficients and state variables for a filter are stored together in an instance data structure.
+                   A separate instance structure must be defined for each filter.
+                   Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+
+  @par           Init Functions
+                   There is also an associated initialization function.
+                   The initialization function performs following operations:
+                   - Sets the values of the internal structure fields.
+                   - Zeros out the values in the state buffer.
+                   To do this manually without calling the init function, assign the follow subfields of the instance structure:
+                   numStages, pCoeffs, pState. Also set all of the values in pState to zero.
+  @par
+                   Use of the initialization function is optional.
+                   However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+                   To place an instance structure into a const data section, the instance structure must be manually initialized.
+                   Set the values in the state buffer to zeros before static initialization.
+                   For example, to statically initialize the instance structure use
+  <pre>
+      arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
+      arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
+  </pre>
+                   where <code>numStages</code> is the number of Biquad stages in the filter;
+                   <code>pState</code> is the address of the state buffer.
+                   <code>pCoeffs</code> is the address of the coefficient buffer;
 */
 
 /**
-* @addtogroup BiquadCascadeDF2T
-* @{
-*/
+  @addtogroup BiquadCascadeDF2T
+  @{
+ */
 
 /**
-* @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
-* @param[in]  *S        points to an instance of the filter data structure.
-* @param[in]  *pSrc     points to the block of input data.
-* @param[out] *pDst     points to the block of output data
-* @param[in]  blockSize number of samples to process.
-* @return none.
-*/
-
+  @brief         Processing function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in]     S         points to an instance of the filter data structure
+  @param[in]     pSrc      points to the block of input data
+  @param[out]    pDst      points to the block of output data
+  @param[in]     blockSize number of samples to process
+  @return        none
+ */
 
 LOW_OPTIMIZATION_ENTER
 void arm_biquad_cascade_df2T_f64(
-const arm_biquad_cascade_df2T_instance_f64 * S,
-float64_t * pSrc,
-float64_t * pDst,
-uint32_t blockSize)
+  const arm_biquad_cascade_df2T_instance_f64 * S,
+        float64_t * pSrc,
+        float64_t * pDst,
+        uint32_t blockSize)
 {
 
-   float64_t *pIn = pSrc;                         /*  source pointer            */
-   float64_t *pOut = pDst;                        /*  destination pointer       */
-   float64_t *pState = S->pState;                 /*  State pointer             */
-   float64_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
-   float64_t acc1;                                /*  accumulator               */
-   float64_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
-   float64_t Xn1;                                 /*  temporary input           */
-   float64_t d1, d2;                              /*  state variables           */
-   uint32_t sample, stage = S->numStages;         /*  loop counters             */
-
-#if defined(ARM_MATH_CM7)
+        float64_t *pIn = pSrc;                         /* Source pointer */
+        float64_t *pOut = pDst;                        /* Destination pointer */
+        float64_t *pState = S->pState;                 /* State pointer */
+        float64_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float64_t acc1;                                /* Accumulator */
+        float64_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        float64_t Xn1;                                 /* Temporary input */
+        float64_t d1, d2;                              /* State variables */
+        uint32_t sample, stage = S->numStages;         /* Loop counters */
 
-   float64_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8;   /*  Input State variables     */
-   float64_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16;
-   float64_t acc2, acc3, acc4, acc5, acc6, acc7;  /*  Simulates the accumulator */
-   float64_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16;
 
    do
    {
@@ -170,421 +159,285 @@ uint32_t blockSize)
       b1 = pCoeffs[1];
       b2 = pCoeffs[2];
       a1 = pCoeffs[3];
-      /* Apply loop unrolling and compute 16 output values simultaneously. */
-      sample = blockSize >> 4U;
       a2 = pCoeffs[4];
 
-      /*Reading the state values */
+      /* Reading the state values */
       d1 = pState[0];
       d2 = pState[1];
 
       pCoeffs += 5U;
 
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 16 outputs at a time */
+      sample = blockSize >> 4U;
 
-      /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.
-       ** a second loop below computes the remaining 1 to 15 samples. */
       while (sample > 0U) {
 
          /* y[n] = b0 * x[n] + d1 */
          /* d1 = b1 * x[n] + a1 * y[n] + d2 */
          /* d2 = b2 * x[n] + a2 * y[n] */
 
-         /* Read the first 2 inputs. 2 cycles */
-         Xn1  = pIn[0 ];
-         Xn2  = pIn[1 ];
+/*  1 */
+         Xn1 = *pIn++;
 
-         /* Sample 1. 5 cycles */
-         Xn3  = pIn[2 ];
          acc1 = b0 * Xn1 + d1;
 
-         Xn4  = pIn[3 ];
          d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
 
-         Xn5  = pIn[4 ];
          d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+
+/*  2 */
+         Xn1 = *pIn++;
 
-         Xn6  = pIn[5 ];
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
          d1 += a1 * acc1;
 
-         Xn7  = pIn[6 ];
+         d2 = b2 * Xn1;
          d2 += a2 * acc1;
 
-         /* Sample 2. 5 cycles */
-         Xn8  = pIn[7 ];
-         acc2 = b0 * Xn2 + d1;
-
-         Xn9  = pIn[8 ];
-         d1 = b1 * Xn2 + d2;
-
-         Xn10 = pIn[9 ];
-         d2 = b2 * Xn2;
-
-         Xn11 = pIn[10];
-         d1 += a1 * acc2;
-
-         Xn12 = pIn[11];
-         d2 += a2 * acc2;
-
-         /* Sample 3. 5 cycles */
-         Xn13 = pIn[12];
-         acc3 = b0 * Xn3 + d1;
-
-         Xn14 = pIn[13];
-         d1 = b1 * Xn3 + d2;
-
-         Xn15 = pIn[14];
-         d2 = b2 * Xn3;
-
-         Xn16 = pIn[15];
-         d1 += a1 * acc3;
-
-         pIn += 16;
-         d2 += a2 * acc3;
-
-         /* Sample 4. 5 cycles */
-         acc4 = b0 * Xn4 + d1;
-         d1 = b1 * Xn4 + d2;
-         d2 = b2 * Xn4;
-         d1 += a1 * acc4;
-         d2 += a2 * acc4;
-
-         /* Sample 5. 5 cycles */
-         acc5 = b0 * Xn5 + d1;
-         d1 = b1 * Xn5 + d2;
-         d2 = b2 * Xn5;
-         d1 += a1 * acc5;
-         d2 += a2 * acc5;
-
-         /* Sample 6. 5 cycles */
-         acc6 = b0 * Xn6 + d1;
-         d1 = b1 * Xn6 + d2;
-         d2 = b2 * Xn6;
-         d1 += a1 * acc6;
-         d2 += a2 * acc6;
-
-         /* Sample 7. 5 cycles */
-         acc7 = b0 * Xn7 + d1;
-         d1 = b1 * Xn7 + d2;
-         d2 = b2 * Xn7;
-         d1 += a1 * acc7;
-         d2 += a2 * acc7;
-
-         /* Sample 8. 5 cycles */
-         acc8 = b0 * Xn8 + d1;
-         d1 = b1 * Xn8 + d2;
-         d2 = b2 * Xn8;
-         d1 += a1 * acc8;
-         d2 += a2 * acc8;
-
-         /* Sample 9. 5 cycles */
-         acc9 = b0 * Xn9 + d1;
-         d1 = b1 * Xn9 + d2;
-         d2 = b2 * Xn9;
-         d1 += a1 * acc9;
-         d2 += a2 * acc9;
-
-         /* Sample 10. 5 cycles */
-         acc10 = b0 * Xn10 + d1;
-         d1 = b1 * Xn10 + d2;
-         d2 = b2 * Xn10;
-         d1 += a1 * acc10;
-         d2 += a2 * acc10;
-
-         /* Sample 11. 5 cycles */
-         acc11 = b0 * Xn11 + d1;
-         d1 = b1 * Xn11 + d2;
-         d2 = b2 * Xn11;
-         d1 += a1 * acc11;
-         d2 += a2 * acc11;
-
-         /* Sample 12. 5 cycles */
-         acc12 = b0 * Xn12 + d1;
-         d1 = b1 * Xn12 + d2;
-         d2 = b2 * Xn12;
-         d1 += a1 * acc12;
-         d2 += a2 * acc12;
-
-         /* Sample 13. 5 cycles */
-         acc13 = b0 * Xn13 + d1;
-         d1 = b1 * Xn13 + d2;
-         d2 = b2 * Xn13;
-
-         pOut[0 ] = acc1 ;
-         d1 += a1 * acc13;
-
-         pOut[1 ] = acc2 ;
-         d2 += a2 * acc13;
-
-         /* Sample 14. 5 cycles */
-         pOut[2 ] = acc3 ;
-         acc14 = b0 * Xn14 + d1;
-
-         pOut[3 ] = acc4 ;
-         d1 = b1 * Xn14 + d2;
-
-         pOut[4 ] = acc5 ;
-         d2 = b2 * Xn14;
-
-         pOut[5 ] = acc6 ;
-         d1 += a1 * acc14;
-
-         pOut[6 ] = acc7 ;
-         d2 += a2 * acc14;
-
-         /* Sample 15. 5 cycles */
-         pOut[7 ] = acc8 ;
-         pOut[8 ] = acc9 ;
-         acc15 = b0 * Xn15 + d1;
-
-         pOut[9 ] = acc10;
-         d1 = b1 * Xn15 + d2;
-
-         pOut[10] = acc11;
-         d2 = b2 * Xn15;
-
-         pOut[11] = acc12;
-         d1 += a1 * acc15;
-
-         pOut[12] = acc13;
-         d2 += a2 * acc15;
-
-         /* Sample 16. 5 cycles */
-         pOut[13] = acc14;
-         acc16 = b0 * Xn16 + d1;
-
-         pOut[14] = acc15;
-         d1 = b1 * Xn16 + d2;
-
-         pOut[15] = acc16;
-         d2 = b2 * Xn16;
+         *pOut++ = acc1;
 
-         sample--;
-         d1 += a1 * acc16;
+/*  3 */
+         Xn1 = *pIn++;
 
-         pOut += 16;
-         d2 += a2 * acc16;
-      }
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/*  4 */
+         Xn1 = *pIn++;
 
-      sample = blockSize & 0xFu;
-      while (sample > 0U) {
-         Xn1 = *pIn;
          acc1 = b0 * Xn1 + d1;
 
-         pIn++;
          d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
 
-         *pOut = acc1;
          d2 = b2 * Xn1;
+         d2 += a2 * acc1;
 
-         pOut++;
+         *pOut++ = acc1;
+
+/*  5 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
          d1 += a1 * acc1;
 
-         sample--;
+         d2 = b2 * Xn1;
          d2 += a2 * acc1;
-      }
 
-      /* Store the updated state variables back into the state array */
-      pState[0] = d1;
-      /* The current stage input is given as the output to the next stage */
-      pIn = pDst;
+         *pOut++ = acc1;
 
-      pState[1] = d2;
-      /* decrement the loop counter */
-      stage--;
+/*  6 */
+         Xn1 = *pIn++;
 
-      pState += 2U;
+         acc1 = b0 * Xn1 + d1;
 
-      /*Reset the output working pointer */
-      pOut = pDst;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
 
-   } while (stage > 0U);
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
 
-#elif defined(ARM_MATH_CM0_FAMILY)
+         *pOut++ = acc1;
 
-   /* Run the below code for Cortex-M0 */
+/*  7 */
+         Xn1 = *pIn++;
 
-   do
-   {
-      /* Reading the coefficients */
-      b0 = *pCoeffs++;
-      b1 = *pCoeffs++;
-      b2 = *pCoeffs++;
-      a1 = *pCoeffs++;
-      a2 = *pCoeffs++;
+         acc1 = b0 * Xn1 + d1;
 
-      /*Reading the state values */
-      d1 = pState[0];
-      d2 = pState[1];
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
 
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
 
-      sample = blockSize;
+         *pOut++ = acc1;
 
-      while (sample > 0U)
-      {
-         /* Read the input */
+/*  8 */
          Xn1 = *pIn++;
 
-         /* y[n] = b0 * x[n] + d1 */
-         acc1 = (b0 * Xn1) + d1;
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
 
-         /* Store the result in the accumulator in the destination buffer. */
          *pOut++ = acc1;
 
-         /* Every time after the output is computed state should be updated. */
-         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-         d1 = ((b1 * Xn1) + (a1 * acc1)) + d2;
+/*  9 */
+         Xn1 = *pIn++;
 
-         /* d2 = b2 * x[n] + a2 * y[n] */
-         d2 = (b2 * Xn1) + (a2 * acc1);
+         acc1 = b0 * Xn1 + d1;
 
-         /* decrement the loop counter */
-         sample--;
-      }
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
 
-      /* Store the updated state variables back into the state array */
-      *pState++ = d1;
-      *pState++ = d2;
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
 
-      /* The current stage input is given as the output to the next stage */
-      pIn = pDst;
+         *pOut++ = acc1;
 
-      /*Reset the output working pointer */
-      pOut = pDst;
+/* 10 */
+         Xn1 = *pIn++;
 
-      /* decrement the loop counter */
-      stage--;
+         acc1 = b0 * Xn1 + d1;
 
-   } while (stage > 0U);
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
 
-#else
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
 
-   float64_t Xn2, Xn3, Xn4;                  	  /*  Input State variables     */
-   float64_t acc2, acc3, acc4;              		  /*  accumulator               */
+         *pOut++ = acc1;
 
+/* 11 */
+         Xn1 = *pIn++;
 
-   float64_t p0, p1, p2, p3, p4, A1;
+         acc1 = b0 * Xn1 + d1;
 
-   /* Run the below code for Cortex-M4 and Cortex-M3 */
-   do
-   {
-      /* Reading the coefficients */
-      b0 = *pCoeffs++;
-      b1 = *pCoeffs++;
-      b2 = *pCoeffs++;
-      a1 = *pCoeffs++;
-      a2 = *pCoeffs++;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
 
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
 
-      /*Reading the state values */
-      d1 = pState[0];
-      d2 = pState[1];
+         *pOut++ = acc1;
 
-      /* Apply loop unrolling and compute 4 output values simultaneously. */
-      sample = blockSize >> 2U;
+/* 12 */
+         Xn1 = *pIn++;
 
-      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-   ** a second loop below computes the remaining 1 to 3 samples. */
-      while (sample > 0U) {
+         acc1 = b0 * Xn1 + d1;
 
-         /* y[n] = b0 * x[n] + d1 */
-         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
-         /* d2 = b2 * x[n] + a2 * y[n] */
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 13 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 14 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 15 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
+
+/* 16 */
+         Xn1 = *pIn++;
+
+         acc1 = b0 * Xn1 + d1;
 
-         /* Read the four inputs */
-         Xn1 = pIn[0];
-         Xn2 = pIn[1];
-         Xn3 = pIn[2];
-         Xn4 = pIn[3];
-         pIn += 4;
-
-         p0 = b0 * Xn1;
-         p1 = b1 * Xn1;
-         acc1 = p0 + d1;
-         p0 = b0 * Xn2;
-         p3 = a1 * acc1;
-         p2 = b2 * Xn1;
-         A1 = p1 + p3;
-         p4 = a2 * acc1;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         p1 = b1 * Xn2;
-         acc2 = p0 + d1;
-         p0 = b0 * Xn3;
-         p3 = a1 * acc2;
-         p2 = b2 * Xn2;
-         A1 = p1 + p3;
-         p4 = a2 * acc2;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         p1 = b1 * Xn3;
-         acc3 = p0 + d1;
-         p0 = b0 * Xn4;
-         p3 = a1 * acc3;
-         p2 = b2 * Xn3;
-         A1 = p1 + p3;
-         p4 = a2 * acc3;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         acc4 = p0 + d1;
-         p1 = b1 * Xn4;
-         p3 = a1 * acc4;
-         p2 = b2 * Xn4;
-         A1 = p1 + p3;
-         p4 = a2 * acc4;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
-
-         pOut[0] = acc1;
-         pOut[1] = acc2;
-         pOut[2] = acc3;
-         pOut[3] = acc4;
-				 pOut += 4;
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
+
+         *pOut++ = acc1;
 
+         /* decrement loop counter */
          sample--;
       }
 
-      sample = blockSize & 0x3U;
+    /* Loop unrolling: Compute remaining outputs */
+      sample = blockSize & 0xFU;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
       while (sample > 0U) {
          Xn1 = *pIn++;
 
-         p0 = b0 * Xn1;
-         p1 = b1 * Xn1;
-         acc1 = p0 + d1;
-         p3 = a1 * acc1;
-         p2 = b2 * Xn1;
-         A1 = p1 + p3;
-         p4 = a2 * acc1;
-         d1 = A1 + d2;
-         d2 = p2 + p4;
+         acc1 = b0 * Xn1 + d1;
+
+         d1 = b1 * Xn1 + d2;
+         d1 += a1 * acc1;
+
+         d2 = b2 * Xn1;
+         d2 += a2 * acc1;
 
          *pOut++ = acc1;
 
+         /* decrement loop counter */
          sample--;
       }
 
       /* Store the updated state variables back into the state array */
-      *pState++ = d1;
-      *pState++ = d2;
+      pState[0] = d1;
+      pState[1] = d2;
+
+      pState += 2U;
 
       /* The current stage input is given as the output to the next stage */
       pIn = pDst;
 
-      /*Reset the output working pointer */
+      /* Reset the output working pointer */
       pOut = pDst;
 
-      /* decrement the loop counter */
+      /* decrement loop counter */
       stage--;
 
    } while (stage > 0U);
 
-#endif
-
 }
 LOW_OPTIMIZATION_EXIT
 
 /**
-   * @} end of BiquadCascadeDF2T group
-   */
+  @} end of BiquadCascadeDF2T group
+ */