Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/Flipper-Zero/STM32CubeWB.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c')
-rw-r--r-- Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c 354
1 files changed, 221 insertions, 133 deletions
diff --git a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c
index ed6744265..7af8a44e2 100644
--- a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c
+++ b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c
@@ -3,13 +3,13 @@
* Title: arm_fir_decimate_q31.c
* Description: Q31 FIR Decimator
*
- * $Date: 27. January 2017
- * $Revision: V.1.5.1
+ * $Date: 18. March 2019
+ * $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
- * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -29,130 +29,164 @@
#include "arm_math.h"
/**
- * @ingroup groupFilters
+ @ingroup groupFilters
*/
/**
- * @addtogroup FIR_decimate
- * @{
+ @addtogroup FIR_decimate
+ @{
*/
/**
- * @brief Processing function for the Q31 FIR decimator.
- * @param[in] *S points to an instance of the Q31 FIR decimator structure.
- * @param[in] *pSrc points to the block of input data.
- * @param[out] *pDst points to the block of output data
- * @param[in] blockSize number of input samples to process per call.
- * @return none
- *
- * <b>Scaling and Overflow Behavior:</b>
- * \par
- * The function is implemented using an internal 64-bit accumulator.
- * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
- * Thus, if the accumulator result overflows it wraps around rather than clip.
- * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
- * After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
- *
- * \par
- * Refer to the function <code>arm_fir_decimate_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+ @brief Processing function for the Q31 FIR decimator.
+ @param[in] S points to an instance of the Q31 FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of input samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clipping.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
+ After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
+
+ @remark
+ Refer to \ref arm_fir_decimate_fast_q31() for a faster but less precise implementation of this function.
*/
void arm_fir_decimate_q31(
const arm_fir_decimate_instance_q31 * S,
- q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
{
- q31_t *pState = S->pState; /* State pointer */
- q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
- q31_t *px; /* Temporary pointers for state buffer */
- q31_t *pb; /* Temporary pointers for coefficient buffer */
- q63_t sum0; /* Accumulator */
- uint32_t numTaps = S->numTaps; /* Number of taps */
- uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
-
-
-#if defined (ARM_MATH_DSP)
-
- /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCur; /* Points to the current sample of the state */
+ q31_t *px0; /* Temporary pointer for state buffer */
+ const q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ q63_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t *px1, *px2, *px3;
+ q31_t x1, x2, x3;
+ q63_t acc1, acc2, acc3;
+#endif
/* S->pState buffer contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (numTaps - 1U);
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (numTaps - 1U);
- /* Total number of output samples to be computed */
- blkCnt = outBlockSize;
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Loop unrolling: Compute 4 samples at a time */
+ blkCnt = outBlockSize >> 2U;
+
+ /* Samples loop unrolled by 4 */
while (blkCnt > 0U)
{
- /* Copy decimation factor number of new input samples into the state buffer */
- i = S->M;
+ /* Copy 4 * decimation factor number of new input samples into the state buffer */
+ i = S->M * 4;
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
- /* Set accumulator to zero */
- sum0 = 0;
+ /* Set accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
- /* Initialize state pointer */
- px = pState;
+ /* Initialize state pointer for all the samples */
+ px0 = pState;
+ px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
/* Initialize coeff pointer */
pb = pCoeffs;
- /* Loop unrolling. Process 4 taps at a time. */
- tapCnt = numTaps >> 2;
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
- /* Loop over the number of taps. Unroll by a factor of 4.
- ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
{
/* Read the b[numTaps-1] coefficient */
c0 = *(pb++);
- /* Read x[n-numTaps-1] sample */
- x0 = *(px++);
+ /* Read x[n-numTaps-1] sample for acc0 */
+ x0 = *(px0++);
+ /* Read x[n-numTaps-1] sample for acc1 */
+ x1 = *(px1++);
+ /* Read x[n-numTaps-1] sample for acc2 */
+ x2 = *(px2++);
+ /* Read x[n-numTaps-1] sample for acc3 */
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
/* Read the b[numTaps-2] coefficient */
c0 = *(pb++);
- /* Read x[n-numTaps-2] sample */
- x0 = *(px++);
+ /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
/* Read the b[numTaps-3] coefficient */
c0 = *(pb++);
- /* Read x[n-numTaps-3] sample */
- x0 = *(px++);
+ /* Read x[n-numTaps-3] sample for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
/* Read the b[numTaps-4] coefficient */
c0 = *(pb++);
- /* Read x[n-numTaps-4] sample */
- x0 = *(px++);
+ /* Read x[n-numTaps-4] sample for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
- /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ /* Loop unrolling: Compute remaining taps */
tapCnt = numTaps % 0x4U;
while (tapCnt > 0U)
@@ -160,70 +194,46 @@ void arm_fir_decimate_q31(
/* Read coefficients */
c0 = *(pb++);
- /* Fetch 1 state variable */
- x0 = *(px++);
+ /* Fetch state variables for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
/* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
/* Advance the state pointer by the decimation factor
* to process the next group of decimation factor number samples */
- pState = pState + S->M;
+ pState = pState + S->M * 4;
/* The result is in the accumulator, store in the destination buffer. */
- *pDst++ = (q31_t) (sum0 >> 31);
+ *pDst++ = (q31_t) (acc0 >> 31);
+ *pDst++ = (q31_t) (acc1 >> 31);
+ *pDst++ = (q31_t) (acc2 >> 31);
+ *pDst++ = (q31_t) (acc3 >> 31);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
- /* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
- ** This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- i = (numTaps - 1U) >> 2U;
-
- /* copy data */
- while (i > 0U)
- {
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- i--;
- }
-
- i = (numTaps - 1U) % 0x04U;
-
- /* copy data */
- while (i > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement the loop counter */
- i--;
- }
+ /* Loop unrolling: Compute remaining samples */
+ blkCnt = outBlockSize % 0x4U;
#else
-/* Run the below code for Cortex-M0 */
-
- /* S->pState buffer contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = S->pState + (numTaps - 1U);
-
- /* Total number of output samples to be computed */
+ /* Initialize blkCnt with number of samples */
blkCnt = outBlockSize;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (blkCnt > 0U)
{
/* Copy decimation factor number of new input samples into the state buffer */
@@ -231,33 +241,88 @@ void arm_fir_decimate_q31(
do
{
- *pStateCurnt++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
} while (--i);
/* Set accumulator to zero */
- sum0 = 0;
+ acc0 = 0;
/* Initialize state pointer */
- px = pState;
+ px0 = pState;
/* Initialize coeff pointer */
pb = pCoeffs;
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-1] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-2] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-3] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
tapCnt = numTaps;
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
while (tapCnt > 0U)
{
/* Read coefficients */
c0 = *pb++;
/* Fetch 1 state variable */
- x0 = *px++;
+ x0 = *px0++;
/* Perform the multiply-accumulate */
- sum0 += (q63_t) x0 *c0;
+ acc0 += (q63_t) x0 * c0;
- /* Decrement the loop counter */
+ /* Decrement loop counter */
tapCnt--;
}
@@ -266,34 +331,57 @@ void arm_fir_decimate_q31(
pState = pState + S->M;
/* The result is in the accumulator, store in the destination buffer. */
- *pDst++ = (q31_t) (sum0 >> 31);
+ *pDst++ = (q31_t) (acc0 >> 31);
- /* Decrement the loop counter */
+ /* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
- ** Now copy the last numTaps - 1 samples to the start of the state buffer.
- ** This prepares the state buffer for the next function call. */
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
- pStateCurnt = S->pState;
+ pStateCur = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
- i = numTaps - 1U;
+ /* Loop unrolling: Copy 4 state samples at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
- /* copy data */
- while (i > 0U)
+ /* Copy data */
+ while (tapCnt > 0U)
{
- *pStateCurnt++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
- /* Decrement the loop counter */
- i--;
+ /* Decrement loop counter */
+ tapCnt--;
}
-#endif /* #if defined (ARM_MATH_DSP) */
+ /* Loop unrolling: Copy remaining state samples */
+ tapCnt = (numTaps - 1U) % 0x04U;
+
+#else
+
+ /* Initialize tapCnt with number of state samples to copy (numTaps - 1) */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
}
/**
- * @} end of FIR_decimate group
+ @} end of FIR_decimate group
*/