diff options
Diffstat (limited to 'Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c')
-rw-r--r-- | Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c | 418 |
1 files changed, 145 insertions, 273 deletions
diff --git a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c index c1b4ce340..7a2b57f18 100644 --- a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c +++ b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c @@ -3,13 +3,13 @@ * Title: arm_fir_sparse_q7.c * Description: Q7 sparse FIR filter processing function * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -28,78 +28,70 @@ #include "arm_math.h" - /** - * @ingroup groupFilters + @ingroup groupFilters */ /** - * @addtogroup FIR_Sparse - * @{ + @addtogroup FIR_Sparse + @{ */ - /** - * @brief Processing function for the Q7 sparse FIR filter. - * @param[in] *S points to an instance of the Q7 sparse FIR structure. - * @param[in] *pSrc points to the block of input data. - * @param[out] *pDst points to the block of output data - * @param[in] *pScratchIn points to a temporary buffer of size blockSize. - * @param[in] *pScratchOut points to a temporary buffer of size blockSize. - * @param[in] blockSize number of input samples to process per call. - * @return none. - * - * <b>Scaling and Overflow Behavior:</b> - * \par - * The function is implemented using a 32-bit internal accumulator. - * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result. - * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. - * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. - * The accumulator is then converted to 18.7 format by discarding the low 7 bits. - * Finally, the result is truncated to 1.7 format. + @brief Processing function for the Q7 sparse FIR filter. + @param[in] S points to an instance of the Q7 sparse FIR structure + @param[in] pSrc points to the block of input data + @param[out] pDst points to the block of output data + @param[in] pScratchIn points to a temporary buffer of size blockSize + @param[in] pScratchOut points to a temporary buffer of size blockSize + @param[in] blockSize number of input samples to process + @return none + + @par Scaling and Overflow Behavior + The function is implemented using a 32-bit internal accumulator. + Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result. + The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format. + There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved. + The accumulator is then converted to 18.7 format by discarding the low 7 bits. + Finally, the result is truncated to 1.7 format. */ void arm_fir_sparse_q7( - arm_fir_sparse_instance_q7 * S, - q7_t * pSrc, - q7_t * pDst, - q7_t * pScratchIn, - q31_t * pScratchOut, - uint32_t blockSize) + arm_fir_sparse_instance_q7 * S, + const q7_t * pSrc, + q7_t * pDst, + q7_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize) { - - q7_t *pState = S->pState; /* State pointer */ - q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - q7_t *px; /* Scratch buffer pointer */ - q7_t *py = pState; /* Temporary pointers for state buffer */ - q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ - q7_t *pOut = pDst; /* Destination pointer */ - int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ - uint32_t delaySize = S->maxDelay + blockSize; /* state length */ - uint16_t numTaps = S->numTaps; /* Filter order */ - int32_t readIndex; /* Read index of the state buffer */ - uint32_t tapCnt, blkCnt; /* loop counters */ - q7_t coeff = *pCoeffs++; /* Read the coefficient value */ - q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */ - q31_t in; - - -#if defined (ARM_MATH_DSP) - - /* Run the below code for Cortex-M4 and Cortex-M3 */ - - q7_t in1, in2, in3, in4; + q7_t *pState = S->pState; /* State pointer */ + const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q7_t *px; /* Scratch buffer pointer */ + q7_t *py = pState; /* Temporary pointers for state buffer */ + q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + q7_t *pOut = pDst; /* Destination pointer */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */ + q31_t in; + q7_t coeff = *pCoeffs++; /* Read the coefficient value */ + +#if defined (ARM_MATH_LOOPUNROLL) + q7_t in1, in2, in3, in4; +#endif /* BlockSize of Input samples are copied into the state buffer */ /* StateIndex points to the starting position to write in the state buffer */ - arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, - blockSize); + arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, blockSize); /* Loop over the number of taps. */ tapCnt = numTaps; /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; /* Wraparound of readIndex */ if (readIndex < 0) @@ -111,8 +103,8 @@ void arm_fir_sparse_q7( py = pState; /* blockSize samples are read from the state buffer */ - arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, - (int32_t) blockSize, 1, blockSize); + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, + pb, pb, (int32_t) blockSize, 1, blockSize); /* Working pointer for the scratch buffer of state values */ px = pb; @@ -120,32 +112,40 @@ void arm_fir_sparse_q7( /* Working pointer for scratch buffer of output values */ pScratchOut = pScr2; - /* Loop over the blockSize. Unroll by a factor of 4. - * Compute 4 multiplications at a time. */ - blkCnt = blockSize >> 2; + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time. */ + blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* Perform multiplication and store in the scratch buffer */ - *pScratchOut++ = ((q31_t) * px++ * coeff); - *pScratchOut++ = ((q31_t) * px++ * coeff); - *pScratchOut++ = ((q31_t) * px++ * coeff); - *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) *px++ * coeff); + *pScratchOut++ = ((q31_t) *px++ * coeff); + *pScratchOut++ = ((q31_t) *px++ * coeff); + *pScratchOut++ = ((q31_t) *px++ * coeff); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, - * compute the remaining samples */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* Perform multiplication and store in the scratch buffer */ - *pScratchOut++ = ((q31_t) * px++ * coeff); + /* Perform Multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) *px++ * coeff); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -154,7 +154,7 @@ void arm_fir_sparse_q7( coeff = *pCoeffs++; /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; /* Wraparound of readIndex */ if (readIndex < 0) @@ -171,8 +171,8 @@ void arm_fir_sparse_q7( py = pState; /* blockSize samples are read from the state buffer */ - arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, - (int32_t) blockSize, 1, blockSize); + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, + pb, pb, (int32_t) blockSize, 1, blockSize); /* Working pointer for the scratch buffer of state values */ px = pb; @@ -180,9 +180,11 @@ void arm_fir_sparse_q7( /* Working pointer for scratch buffer of output values */ pScratchOut = pScr2; - /* Loop over the blockSize. Unroll by a factor of 4. - * Compute 4 MACS at a time. */ - blkCnt = blockSize >> 2; + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time. */ + blkCnt = blockSize >> 2U; while (blkCnt > 0U) { @@ -196,21 +198,27 @@ void arm_fir_sparse_q7( in = *pScratchOut + ((q31_t) * px++ * coeff); *pScratchOut++ = in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, - * compute the remaining samples */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { /* Perform Multiply-Accumulate */ - in = *pScratchOut + ((q31_t) * px++ * coeff); + in = *pScratchOut + ((q31_t) *px++ * coeff); *pScratchOut++ = in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -219,8 +227,7 @@ void arm_fir_sparse_q7( coeff = *pCoeffs++; /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = ((int32_t) S->stateIndex - - (int32_t) blockSize) - *pTapDelay++; + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; /* Wraparound of readIndex */ if (readIndex < 0) @@ -228,242 +235,107 @@ void arm_fir_sparse_q7( readIndex += (int32_t) delaySize; } - /* Decrement the tap loop counter */ + /* Decrement loop counter */ tapCnt--; } - /* Compute last tap without the final read of pTapDelay */ - - /* Working pointer for state buffer is updated */ - py = pState; - - /* blockSize samples are read from the state buffer */ - arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, - (int32_t) blockSize, 1, blockSize); - - /* Working pointer for the scratch buffer of state values */ - px = pb; - - /* Working pointer for scratch buffer of output values */ - pScratchOut = pScr2; + /* Compute last tap without the final read of pTapDelay */ - /* Loop over the blockSize. Unroll by a factor of 4. - * Compute 4 MACS at a time. */ - blkCnt = blockSize >> 2; + /* Working pointer for state buffer is updated */ + py = pState; - while (blkCnt > 0U) - { - /* Perform Multiply-Accumulate */ - in = *pScratchOut + ((q31_t) * px++ * coeff); - *pScratchOut++ = in; - in = *pScratchOut + ((q31_t) * px++ * coeff); - *pScratchOut++ = in; - in = *pScratchOut + ((q31_t) * px++ * coeff); - *pScratchOut++ = in; - in = *pScratchOut + ((q31_t) * px++ * coeff); - *pScratchOut++ = in; + /* blockSize samples are read from the state buffer */ + arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, + pb, pb, (int32_t) blockSize, 1, blockSize); - /* Decrement the loop counter */ - blkCnt--; - } + /* Working pointer for the scratch buffer of state values */ + px = pb; - /* If the blockSize is not a multiple of 4, - * compute the remaining samples */ - blkCnt = blockSize % 0x4U; + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; - while (blkCnt > 0U) - { - /* Perform Multiply-Accumulate */ - in = *pScratchOut + ((q31_t) * px++ * coeff); - *pScratchOut++ = in; - /* Decrement the loop counter */ - blkCnt--; - } +#if defined (ARM_MATH_LOOPUNROLL) - /* All the output values are in pScratchOut buffer. - Convert them into 1.15 format, saturate and store in the destination buffer. */ - /* Loop over the blockSize. */ - blkCnt = blockSize >> 2; + /* Loop unrolling: Compute 4 outputs at a time. */ + blkCnt = blockSize >> 2U; while (blkCnt > 0U) { - in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8); - in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8); - in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8); - in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8); - - *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4); - - /* Decrement the blockSize loop counter */ + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) *px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) *px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) *px++ * coeff); + *pScratchOut++ = in; + in = *pScratchOut + ((q31_t) *px++ * coeff); + *pScratchOut++ = in; + + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, - remaining samples are processed in the below loop */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; - while (blkCnt > 0U) - { - *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8); - - /* Decrement the blockSize loop counter */ - blkCnt--; - } - #else - /* Run the below code for Cortex-M0 */ - - /* BlockSize of Input samples are copied into the state buffer */ - /* StateIndex points to the starting position to write in the state buffer */ - arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, - blockSize); - - /* Loop over the number of taps. */ - tapCnt = numTaps; - - /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; - - /* Wraparound of readIndex */ - if (readIndex < 0) - { - readIndex += (int32_t) delaySize; - } - - /* Working pointer for state buffer is updated */ - py = pState; - - /* blockSize samples are read from the state buffer */ - arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, - (int32_t) blockSize, 1, blockSize); - - /* Working pointer for the scratch buffer of state values */ - px = pb; - - /* Working pointer for scratch buffer of output values */ - pScratchOut = pScr2; - - /* Loop over the blockSize */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* Perform multiplication and store in the scratch buffer */ - *pScratchOut++ = ((q31_t) * px++ * coeff); + /* Perform Multiply-Accumulate */ + in = *pScratchOut + ((q31_t) *px++ * coeff); + *pScratchOut++ = in; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Load the coefficient value and - * increment the coefficient buffer for the next set of state values */ - coeff = *pCoeffs++; - - /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; - - /* Wraparound of readIndex */ - if (readIndex < 0) - { - readIndex += (int32_t) delaySize; - } + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ +#if defined (ARM_MATH_LOOPUNROLL) - /* Loop over the number of taps. */ - tapCnt = (uint32_t) numTaps - 2U; + /* Loop unrolling: Compute 4 outputs at a time. */ + blkCnt = blockSize >> 2U; - while (tapCnt > 0U) + while (blkCnt > 0U) { - /* Working pointer for state buffer is updated */ - py = pState; - - /* blockSize samples are read from the state buffer */ - arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, - (int32_t) blockSize, 1, blockSize); - - /* Working pointer for the scratch buffer of state values */ - px = pb; - - /* Working pointer for scratch buffer of output values */ - pScratchOut = pScr2; - - /* Loop over the blockSize */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* Perform Multiply-Accumulate */ - in = *pScratchOut + ((q31_t) * px++ * coeff); - *pScratchOut++ = in; - - /* Decrement the loop counter */ - blkCnt--; - } - - /* Load the coefficient value and - * increment the coefficient buffer for the next set of state values */ - coeff = *pCoeffs++; - - /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = - ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++; + in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8); + in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8); - /* Wraparound of readIndex */ - if (readIndex < 0) - { - readIndex += (int32_t) delaySize; - } + write_q7x4_ia (&pOut, __PACKq7(in1, in2, in3, in4)); - /* Decrement the tap loop counter */ - tapCnt--; + /* Decrement loop counter */ + blkCnt--; } - /* Compute last tap without the final read of pTapDelay */ - - /* Working pointer for state buffer is updated */ - py = pState; - - /* blockSize samples are read from the state buffer */ - arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb, - (int32_t) blockSize, 1, blockSize); - - /* Working pointer for the scratch buffer of state values */ - px = pb; - - /* Working pointer for scratch buffer of output values */ - pScratchOut = pScr2; - - /* Loop over the blockSize */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* Perform Multiply-Accumulate */ - in = *pScratchOut + ((q31_t) * px++ * coeff); - *pScratchOut++ = in; + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; - /* Decrement the loop counter */ - blkCnt--; - } +#else - /* All the output values are in pScratchOut buffer. - Convert them into 1.15 format, saturate and store in the destination buffer. */ - /* Loop over the blockSize. */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8); - /* Decrement the blockSize loop counter */ + /* Decrement loop counter */ blkCnt--; } -#endif /* #if defined (ARM_MATH_DSP) */ - } /** - * @} end of FIR_Sparse group + @} end of FIR_Sparse group */ |