1 files changed, 164 insertions, 135 deletions
diff --git a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c
index 3d7d3d078..caa2f51fd 100644
--- a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c
+++ b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c
@@ -3,13 +3,13 @@
  * Title:        arm_correlate_q31.c
  * Description:  Correlation of Q31 sequences
  *
- * $Date:        27. January 2017
- * $Revision:    V.1.5.1
+ * $Date:        18. March 2019
+ * $Revision:    V1.6.0
  *
  * Target Processor: Cortex-M cores
  * -------------------------------------------------------------------- */
 /*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -29,63 +29,64 @@
 #include "arm_math.h"
 
 /**
- * @ingroup groupFilters
+  @ingroup groupFilters
  */
 
 /**
- * @addtogroup Corr
- * @{
+  @addtogroup Corr
+  @{
  */
 
 /**
- * @brief Correlation of Q31 sequences.
- * @param[in] *pSrcA points to the first input sequence.
- * @param[in] srcALen length of the first input sequence.
- * @param[in] *pSrcB points to the second input sequence.
- * @param[in] srcBLen length of the second input sequence.
- * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
- * @return none.
- *
- * @details
- * <b>Scaling and Overflow Behavior:</b>
- *
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * There is no saturation on intermediate additions.
- * Thus, if the accumulator overflows it wraps around and distorts the result.
- * The input signals should be scaled down to avoid intermediate overflows.
- * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
- * maximum of min(srcALen, srcBLen) number of additions is carried internally.
- * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
- *
- * \par
- * See <code>arm_correlate_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+  @brief         Correlation of Q31 sequences.
+  @param[in]     pSrcA      points to the first input sequence
+  @param[in]     srcALen    length of the first input sequence
+  @param[in]     pSrcB      points to the second input sequence
+  @param[in]     srcBLen    length of the second input sequence
+  @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
+  @return        none
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using an internal 64-bit accumulator.
+                   The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+                   There is no saturation on intermediate additions.
+                   Thus, if the accumulator overflows it wraps around and distorts the result.
+                   The input signals should be scaled down to avoid intermediate overflows.
+                   Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflows since a
+                   maximum of min(srcALen, srcBLen) number of additions is carried internally.
+                   The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+
+  @remark
+                   Refer to \ref arm_correlate_fast_q31() for a faster but less precise implementation of this function.
  */
 
 void arm_correlate_q31(
-  q31_t * pSrcA,
-  uint32_t srcALen,
-  q31_t * pSrcB,
-  uint32_t srcBLen,
-  q31_t * pDst)
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst)
 {
 
-#if defined (ARM_MATH_DSP)
-
-  /* Run the below code for Cortex-M4 and Cortex-M3 */
-
-  q31_t *pIn1;                                   /* inputA pointer               */
-  q31_t *pIn2;                                   /* inputB pointer               */
-  q31_t *pOut = pDst;                            /* output pointer               */
-  q31_t *px;                                     /* Intermediate inputA pointer  */
-  q31_t *py;                                     /* Intermediate inputB pointer  */
-  q31_t *pSrc1;                                  /* Intermediate pointers        */
-  q63_t sum, acc0, acc1, acc2;                   /* Accumulators                  */
-  q31_t x0, x1, x2, c0;                          /* temporary variables for holding input and coefficient values */
-  uint32_t j, k = 0U, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
-  int32_t inc = 1;                               /* Destination address modifier */
-
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+  const q31_t *pIn1;                                   /* InputA pointer */
+  const q31_t *pIn2;                                   /* InputB pointer */
+        q31_t *pOut = pDst;                            /* Output pointer */
+  const q31_t *px;                                     /* Intermediate inputA pointer */
+  const q31_t *py;                                     /* Intermediate inputB pointer */
+  const q31_t *pSrc1;                                  /* Intermediate pointers */
+        q63_t sum;                                     /* Accumulators */
+        uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
+        uint32_t j, k, count, blkCnt;                  /* Loop counters */
+        uint32_t outBlockSize;
+        int32_t inc = 1;                               /* Destination address modifier */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q63_t acc0, acc1, acc2;                        /* Accumulators */
+        q31_t x0, x1, x2, c0;                          /* Temporary variables for holding input and coefficient values */
+#endif
 
   /* The algorithm implementation is based on the lengths of the inputs. */
   /* srcB is always made to slide across srcA. */
@@ -103,10 +104,10 @@ void arm_correlate_q31(
   if (srcALen >= srcBLen)
   {
     /* Initialization of inputA pointer */
-    pIn1 = (pSrcA);
+    pIn1 = pSrcA;
 
     /* Initialization of inputB pointer */
-    pIn2 = (pSrcB);
+    pIn2 = pSrcB;
 
     /* Number of output samples is calculated */
     outBlockSize = (2U * srcALen) - 1U;
@@ -119,15 +120,14 @@ void arm_correlate_q31(
 
     /* Updating the pointer position to non zero value */
     pOut += j;
-
   }
   else
   {
     /* Initialization of inputA pointer */
-    pIn1 = (pSrcB);
+    pIn1 = pSrcB;
 
     /* Initialization of inputB pointer */
-    pIn2 = (pSrcA);
+    pIn2 = pSrcA;
 
     /* srcBLen is always considered as shorter or equal to srcALen */
     j = srcBLen;
@@ -140,18 +140,18 @@ void arm_correlate_q31(
 
     /* Destination address modifier is set to -1 */
     inc = -1;
-
   }
 
   /* The function is internally
-   * divided into three parts according to the number of multiplications that has to be
-   * taken place between inputA samples and inputB samples. In the first part of the
+   * divided into three stages according to the number of multiplications that have to
+   * take place between inputA samples and inputB samples. In the first stage of the
    * algorithm, the multiplications increase by one for every iteration.
-   * In the second part of the algorithm, srcBLen number of multiplications are done.
-   * In the third part of the algorithm, the multiplications decrease by one
-   * for every iteration.*/
+   * In the second stage of the algorithm, srcBLen number of multiplications are done.
+   * In the third stage of the algorithm, the multiplications decrease by one
+   * for every iteration. */
+
   /* The algorithm is implemented in three stages.
-   * The loop counters of each stage is initiated here. */
+     The loop counters of each stage are initialized here. */
   blockSize1 = srcBLen - 1U;
   blockSize2 = srcALen - (srcBLen - 1U);
   blockSize3 = blockSize1;
@@ -177,6 +177,7 @@ void arm_correlate_q31(
   pSrc1 = pIn2 + (srcBLen - 1U);
   py = pSrc1;
 
+
   /* ------------------------
    * Stage1 process
    * ----------------------*/
@@ -187,37 +188,46 @@ void arm_correlate_q31(
     /* Accumulator is made zero for every iteration */
     sum = 0;
 
-    /* Apply loop unrolling and compute 4 MACs simultaneously. */
-    k = count >> 2;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 MACs at a time */
+    k = count >> 2U;
 
-    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
-     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
     while (k > 0U)
     {
       /* x[0] * y[srcBLen - 4] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * (*py++);
+
       /* x[1] * y[srcBLen - 3] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * (*py++);
+
       /* x[2] * y[srcBLen - 2] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * (*py++);
+
       /* x[3] * y[srcBLen - 1] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * (*py++);
 
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
       k--;
     }
 
-    /* If the count is not a multiple of 4, compute any remaining MACs here.
-     ** No loop unrolling is used. */
+    /* Loop unrolling: Compute remaining MACs */
     k = count % 0x4U;
 
+#else
+
+    /* Initialize k with number of samples */
+    k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
     while (k > 0U)
     {
-      /* Perform the multiply-accumulates */
+      /* Perform the multiply-accumulate */
       /* x[0] * y[srcBLen - 1] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * (*py++);
 
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
       k--;
     }
 
@@ -230,10 +240,10 @@ void arm_correlate_q31(
     py = pSrc1 - count;
     px = pIn1;
 
-    /* Increment the MAC count */
+    /* Increment MAC count */
     count++;
 
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
     blockSize1--;
   }
 
@@ -265,6 +275,8 @@ void arm_correlate_q31(
    * srcBLen should be greater than or equal to 4 */
   if (srcBLen >= 4U)
   {
+#if defined (ARM_MATH_LOOPUNROLL)
+
     /* Loop unroll by 3 */
     blkCnt = blockSize2 / 3;
 
@@ -276,8 +288,8 @@ void arm_correlate_q31(
       acc2 = 0;
 
       /* read x[0], x[1] samples */
-      x0 = *(px++);
-      x1 = *(px++);
+      x0 = *px++;
+      x1 = *px++;
 
       /* Apply loop unrolling and compute 3 MACs simultaneously. */
       k = srcBLen / 3;
@@ -288,7 +300,6 @@ void arm_correlate_q31(
       {
         /* Read y[0] sample */
         c0 = *(py);
-
         /* Read x[2] sample */
         x2 = *(px);
 
@@ -302,11 +313,10 @@ void arm_correlate_q31(
 
         /* Read y[1] sample */
         c0 = *(py + 1U);
-
         /* Read x[3] sample */
         x0 = *(px + 1U);
 
-        /* Perform the multiply-accumulates */
+        /* Perform the multiply-accumulate */
         /* acc0 +=  x[1] * y[1] */
         acc0 += ((q63_t) x1 * c0);
         /* acc1 +=  x[2] * y[1] */
@@ -316,11 +326,10 @@ void arm_correlate_q31(
 
         /* Read y[2] sample */
         c0 = *(py + 2U);
-
         /* Read x[4] sample */
         x1 = *(px + 2U);
 
-        /* Perform the multiply-accumulates */
+        /* Perform the multiply-accumulate */
         /* acc0 +=  x[2] * y[2] */
         acc0 += ((q63_t) x2 * c0);
         /* acc1 +=  x[3] * y[2] */
@@ -358,7 +367,7 @@ void arm_correlate_q31(
         x0 = x1;
         x1 = x2;
 
-        /* Decrement the loop counter */
+        /* Decrement loop counter */
         k--;
       }
 
@@ -380,45 +389,56 @@ void arm_correlate_q31(
       px = pIn1 + count;
       py = pIn2;
 
-
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
       blkCnt--;
     }
 
-    /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
-     ** No loop unrolling is used. */
+    /* Loop unrolling: Compute remaining outputs */
     blkCnt = blockSize2 - 3 * (blockSize2 / 3);
 
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
     while (blkCnt > 0U)
     {
       /* Accumulator is made zero for every iteration */
       sum = 0;
 
-      /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+      /* Loop unrolling: Compute 4 MACs at a time */
       k = srcBLen >> 2U;
 
-      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
-       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
       while (k > 0U)
       {
         /* Perform the multiply-accumulates */
-        sum += (q63_t) * px++ * (*py++);
-        sum += (q63_t) * px++ * (*py++);
-        sum += (q63_t) * px++ * (*py++);
-        sum += (q63_t) * px++ * (*py++);
+        sum += (q63_t) *px++ * *py++;
+        sum += (q63_t) *px++ * *py++;
+        sum += (q63_t) *px++ * *py++;
+        sum += (q63_t) *px++ * *py++;
 
-        /* Decrement the loop counter */
+        /* Decrement loop counter */
         k--;
       }
 
-      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
-       ** No loop unrolling is used. */
+      /* Loop unrolling: Compute remaining MACs */
       k = srcBLen % 0x4U;
 
+#else
+
+      /* Initialize k with number of samples */
+      k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
       while (k > 0U)
       {
         /* Perform the multiply-accumulate */
-        sum += (q63_t) * px++ * (*py++);
+        sum += (q63_t) *px++ * *py++;
 
         /* Decrement the loop counter */
         k--;
@@ -429,14 +449,14 @@ void arm_correlate_q31(
       /* Destination pointer is updated according to the address modifier, inc */
       pOut += inc;
 
-      /* Increment the MAC count */
+      /* Increment MAC count */
       count++;
 
       /* Update the inputA and inputB pointers for next MAC calculation */
       px = pIn1 + count;
       py = pIn2;
 
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
       blkCnt--;
     }
   }
@@ -451,13 +471,13 @@ void arm_correlate_q31(
       /* Accumulator is made zero for every iteration */
       sum = 0;
 
-      /* Loop over srcBLen */
+      /* srcBLen number of MACS should be performed */
       k = srcBLen;
 
       while (k > 0U)
       {
         /* Perform the multiply-accumulate */
-        sum += (q63_t) * px++ * (*py++);
+        sum += (q63_t) *px++ * *py++;
 
         /* Decrement the loop counter */
         k--;
@@ -468,18 +488,19 @@ void arm_correlate_q31(
       /* Destination pointer is updated according to the address modifier, inc */
       pOut += inc;
 
-      /* Increment the MAC count */
+      /* Increment MAC count */
       count++;
 
       /* Update the inputA and inputB pointers for next MAC calculation */
       px = pIn1 + count;
       py = pIn2;
 
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
       blkCnt--;
     }
   }
 
+
   /* --------------------------
    * Initializations of stage3
    * -------------------------*/
@@ -511,37 +532,46 @@ void arm_correlate_q31(
     /* Accumulator is made zero for every iteration */
     sum = 0;
 
-    /* Apply loop unrolling and compute 4 MACs simultaneously. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 MACs at a time */
     k = count >> 2U;
 
-    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
-     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
     while (k > 0U)
     {
-      /* Perform the multiply-accumulates */
+      /* Perform the multiply-accumulate */
       /* sum += x[srcALen - srcBLen + 4] * y[3] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * *py++;
+
       /* sum += x[srcALen - srcBLen + 3] * y[2] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * *py++;
+
       /* sum += x[srcALen - srcBLen + 2] * y[1] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * *py++;
+
       /* sum += x[srcALen - srcBLen + 1] * y[0] */
-      sum += (q63_t) * px++ * (*py++);
+      sum += (q63_t) *px++ * *py++;
 
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
       k--;
     }
 
-    /* If the count is not a multiple of 4, compute any remaining MACs here.
-     ** No loop unrolling is used. */
+    /* Loop unrolling: Compute remaining MACs */
     k = count % 0x4U;
 
+#else
+
+    /* Initialize k with number of samples */
+    k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
     while (k > 0U)
     {
-      /* Perform the multiply-accumulates */
-      sum += (q63_t) * px++ * (*py++);
+      /* Perform the multiply-accumulate */
+      sum += (q63_t) *px++ * *py++;
 
-      /* Decrement the loop counter */
+      /* Decrement loop counter */
       k--;
     }
 
@@ -554,23 +584,22 @@ void arm_correlate_q31(
     px = ++pSrc1;
     py = pIn2;
 
-    /* Decrement the MAC count */
+    /* Decrement MAC count */
     count--;
 
-    /* Decrement the loop counter */
+    /* Decrement loop counter */
     blockSize3--;
   }
 
 #else
+/* alternate version for CM0_FAMILY */
 
-  /* Run the below code for Cortex-M0 */
-
-  q31_t *pIn1 = pSrcA;                           /* inputA pointer               */
-  q31_t *pIn2 = pSrcB + (srcBLen - 1U);          /* inputB pointer               */
-  q63_t sum;                                     /* Accumulators                  */
-  uint32_t i = 0U, j;                            /* loop counters */
-  uint32_t inv = 0U;                             /* Reverse order flag */
-  uint32_t tot = 0U;                             /* Length */
+  const q31_t *pIn1 = pSrcA;                           /* InputA pointer */
+  const q31_t *pIn2 = pSrcB + (srcBLen - 1U);          /* InputB pointer */
+        q63_t sum;                                     /* Accumulators */
+        uint32_t i = 0U, j;                            /* Loop counters */
+        uint32_t inv = 0U;                             /* Reverse order flag */
+        uint32_t tot = 0U;                             /* Length */
 
   /* The algorithm implementation is based on the lengths of the inputs. */
   /* srcB is always made to slide across srcA. */
@@ -618,25 +647,25 @@ void arm_correlate_q31(
 
     /* Setting the reverse flag */
     inv = 1;
-
   }
 
   /* Loop to calculate correlation for output length number of times */
   for (i = 0U; i <= tot; i++)
   {
-    /* Initialize sum with zero to carry on MAC operations */
+    /* Initialize sum with zero to carry out MAC operations */
     sum = 0;
 
     /* Loop to perform MAC operations according to correlation equation */
     for (j = 0U; j <= i; j++)
     {
       /* Check the array limitations */
-      if ((((i - j) < srcBLen) && (j < srcALen)))
+      if (((i - j) < srcBLen) && (j < srcALen))
       {
         /* z[i] += x[i-j] * y[j] */
         sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
       }
     }
+
     /* Store the output in the destination buffer */
     if (inv == 1)
       *pDst-- = (q31_t) (sum >> 31U);
@@ -644,10 +673,10 @@ void arm_correlate_q31(
       *pDst++ = (q31_t) (sum >> 31U);
   }
 
-#endif /*   #if defined (ARM_MATH_DSP) */
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
 
 }
 
 /**
- * @} end of Corr group
+  @} end of Corr group
  */