diff options
Diffstat (limited to 'Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c')
-rw-r--r-- | Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c | 437 |
1 files changed, 154 insertions, 283 deletions
diff --git a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c index e17f2bd98..9cea93e2d 100644 --- a/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c +++ b/Drivers/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c @@ -3,13 +3,13 @@ * Title: arm_fir_sparse_q15.c * Description: Q15 sparse FIR filter processing function * - * $Date: 27. January 2017 - * $Revision: V.1.5.1 + * $Date: 18. March 2019 + * $Revision: V1.6.0 * * Target Processor: Cortex-M cores * -------------------------------------------------------------------- */ /* - * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved. + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -29,72 +29,68 @@ #include "arm_math.h" /** - * @addtogroup FIR_Sparse - * @{ + @ingroup groupFilters */ /** - * @brief Processing function for the Q15 sparse FIR filter. - * @param[in] *S points to an instance of the Q15 sparse FIR structure. - * @param[in] *pSrc points to the block of input data. - * @param[out] *pDst points to the block of output data - * @param[in] *pScratchIn points to a temporary buffer of size blockSize. - * @param[in] *pScratchOut points to a temporary buffer of size blockSize. - * @param[in] blockSize number of input samples to process per call. - * @return none. - * - * <b>Scaling and Overflow Behavior:</b> - * \par - * The function is implemented using an internal 32-bit accumulator. - * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. - * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. - * If the accumulator result overflows it will wrap around rather than saturate. - * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format. - * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. + @addtogroup FIR_Sparse + @{ */ +/** + @brief Processing function for the Q15 sparse FIR filter. + @param[in] S points to an instance of the Q15 sparse FIR structure + @param[in] pSrc points to the block of input data + @param[out] pDst points to the block of output data + @param[in] pScratchIn points to a temporary buffer of size blockSize + @param[in] pScratchOut points to a temporary buffer of size blockSize + @param[in] blockSize number of input samples to process per call + @return none + + @par Scaling and Overflow Behavior + The function is implemented using an internal 32-bit accumulator. + The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator. + Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator. + If the accumulator result overflows it will wrap around rather than saturate. + After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format. + In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits. + */ void arm_fir_sparse_q15( - arm_fir_sparse_instance_q15 * S, - q15_t * pSrc, - q15_t * pDst, - q15_t * pScratchIn, - q31_t * pScratchOut, - uint32_t blockSize) + arm_fir_sparse_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + q15_t * pScratchIn, + q31_t * pScratchOut, + uint32_t blockSize) { - - q15_t *pState = S->pState; /* State pointer */ - q15_t *pIn = pSrc; /* Working pointer for input */ - q15_t *pOut = pDst; /* Working pointer for output */ - q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - q15_t *px; /* Temporary pointers for scratch buffer */ - q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ - q15_t *py = pState; /* Temporary pointers for state buffer */ - int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ - uint32_t delaySize = S->maxDelay + blockSize; /* state length */ - uint16_t numTaps = S->numTaps; /* Filter order */ - int32_t readIndex; /* Read index of the state buffer */ - uint32_t tapCnt, blkCnt; /* loop counters */ - q15_t coeff = *pCoeffs++; /* Read the first coefficient value */ - q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */ - - -#if defined (ARM_MATH_DSP) - - /* Run the below code for Cortex-M4 and Cortex-M3 */ - - q31_t in1, in2; /* Temporary variables */ - + q15_t *pState = S->pState; /* State pointer */ + const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q15_t *px; /* Temporary pointers for scratch buffer */ + q15_t *py = pState; /* Temporary pointers for state buffer */ + q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */ + q15_t *pOut = pDst; /* Working pointer for output */ + int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */ + uint32_t delaySize = S->maxDelay + blockSize; /* state length */ + uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + int32_t readIndex; /* Read index of the state buffer */ + uint32_t tapCnt, blkCnt; /* loop counters */ + q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */ + q15_t coeff = *pCoeffs++; /* Read the first coefficient value */ + +#if defined (ARM_MATH_LOOPUNROLL) + q31_t in1, in2; /* Temporary variables */ +#endif /* BlockSize of Input samples are copied into the state buffer */ /* StateIndex points to the starting position to write in the state buffer */ - arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); + arm_circularWrite_q15(py, (int32_t) delaySize, &S->stateIndex, 1,pSrc, 1, blockSize); /* Loop over the number of taps. */ tapCnt = numTaps; /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; /* Wraparound of readIndex */ if (readIndex < 0) @@ -106,8 +102,8 @@ void arm_fir_sparse_q15( py = pState; /* blockSize samples are read from the state buffer */ - arm_circularRead_q15(py, delaySize, &readIndex, 1, - pb, pb, blockSize, 1, blockSize); + arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1, + pb, pb, (int32_t) blockSize, 1, blockSize); /* Working pointer for the scratch buffer of state values */ px = pb; @@ -115,32 +111,40 @@ void arm_fir_sparse_q15( /* Working pointer for scratch buffer of output values */ pScratchOut = pScr2; - /* Loop over the blockSize. Unroll by a factor of 4. - * Compute 4 multiplications at a time. */ - blkCnt = blockSize >> 2; + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time. */ + blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* Perform multiplication and store in the scratch buffer */ - *pScratchOut++ = ((q31_t) * px++ * coeff); - *pScratchOut++ = ((q31_t) * px++ * coeff); - *pScratchOut++ = ((q31_t) * px++ * coeff); - *pScratchOut++ = ((q31_t) * px++ * coeff); + *pScratchOut++ = ((q31_t) *px++ * coeff); + *pScratchOut++ = ((q31_t) *px++ * coeff); + *pScratchOut++ = ((q31_t) *px++ * coeff); + *pScratchOut++ = ((q31_t) *px++ * coeff); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, - * compute the remaining samples */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* Perform multiplication and store in the scratch buffer */ - *pScratchOut++ = ((q31_t) * px++ * coeff); + /* Perform Multiplication and store in the scratch buffer */ + *pScratchOut++ = ((q31_t) *px++ * coeff); - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -149,7 +153,7 @@ void arm_fir_sparse_q15( coeff = *pCoeffs++; /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; /* Wraparound of readIndex */ if (readIndex < 0) @@ -166,8 +170,8 @@ void arm_fir_sparse_q15( py = pState; /* blockSize samples are read from the state buffer */ - arm_circularRead_q15(py, delaySize, &readIndex, 1, - pb, pb, blockSize, 1, blockSize); + arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1, + pb, pb, (int32_t) blockSize, 1, blockSize); /* Working pointer for the scratch buffer of state values */ px = pb; @@ -175,32 +179,40 @@ void arm_fir_sparse_q15( /* Working pointer for scratch buffer of output values */ pScratchOut = pScr2; - /* Loop over the blockSize. Unroll by a factor of 4. - * Compute 4 MACS at a time. */ - blkCnt = blockSize >> 2; + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 outputs at a time. */ + blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* Perform Multiply-Accumulate */ - *pScratchOut++ += (q31_t) * px++ * coeff; - *pScratchOut++ += (q31_t) * px++ * coeff; - *pScratchOut++ += (q31_t) * px++ * coeff; - *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) *px++ * coeff; + *pScratchOut++ += (q31_t) *px++ * coeff; + *pScratchOut++ += (q31_t) *px++ * coeff; + *pScratchOut++ += (q31_t) *px++ * coeff; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* If the blockSize is not a multiple of 4, - * compute the remaining samples */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; +#else + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { /* Perform Multiply-Accumulate */ - *pScratchOut++ += (q31_t) * px++ * coeff; + *pScratchOut++ += (q31_t) *px++ * coeff; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } @@ -209,7 +221,7 @@ void arm_fir_sparse_q15( coeff = *pCoeffs++; /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++; /* Wraparound of readIndex */ if (readIndex < 0) @@ -217,254 +229,113 @@ void arm_fir_sparse_q15( readIndex += (int32_t) delaySize; } - /* Decrement the tap loop counter */ + /* Decrement loop counter */ tapCnt--; } - /* Compute last tap without the final read of pTapDelay */ - - /* Working pointer for state buffer is updated */ - py = pState; + /* Compute last tap without the final read of pTapDelay */ - /* blockSize samples are read from the state buffer */ - arm_circularRead_q15(py, delaySize, &readIndex, 1, - pb, pb, blockSize, 1, blockSize); - - /* Working pointer for the scratch buffer of state values */ - px = pb; - - /* Working pointer for scratch buffer of output values */ - pScratchOut = pScr2; - - /* Loop over the blockSize. Unroll by a factor of 4. - * Compute 4 MACS at a time. */ - blkCnt = blockSize >> 2; + /* Working pointer for state buffer is updated */ + py = pState; - while (blkCnt > 0U) - { - /* Perform Multiply-Accumulate */ - *pScratchOut++ += (q31_t) * px++ * coeff; - *pScratchOut++ += (q31_t) * px++ * coeff; - *pScratchOut++ += (q31_t) * px++ * coeff; - *pScratchOut++ += (q31_t) * px++ * coeff; + /* blockSize samples are read from the state buffer */ + arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1, + pb, pb, (int32_t) blockSize, 1, blockSize); - /* Decrement the loop counter */ - blkCnt--; - } + /* Working pointer for the scratch buffer of state values */ + px = pb; - /* If the blockSize is not a multiple of 4, - * compute the remaining samples */ - blkCnt = blockSize % 0x4U; + /* Working pointer for scratch buffer of output values */ + pScratchOut = pScr2; - while (blkCnt > 0U) - { - /* Perform Multiply-Accumulate */ - *pScratchOut++ += (q31_t) * px++ * coeff; - /* Decrement the loop counter */ - blkCnt--; - } +#if defined (ARM_MATH_LOOPUNROLL) - /* All the output values are in pScratchOut buffer. - Convert them into 1.15 format, saturate and store in the destination buffer. */ - /* Loop over the blockSize. */ - blkCnt = blockSize >> 2; + /* Loop unrolling: Compute 4 outputs at a time. */ + blkCnt = blockSize >> 2U; while (blkCnt > 0U) { - in1 = *pScr2++; - in2 = *pScr2++; - -#ifndef ARM_MATH_BIG_ENDIAN - - *__SIMD32(pOut)++ = - __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), - 16); - -#else - *__SIMD32(pOut)++ = - __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), - 16); - -#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - - in1 = *pScr2++; - - in2 = *pScr2++; - -#ifndef ARM_MATH_BIG_ENDIAN - - *__SIMD32(pOut)++ = - __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), - 16); - -#else - - *__SIMD32(pOut)++ = - __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), - 16); - -#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) *px++ * coeff; + *pScratchOut++ += (q31_t) *px++ * coeff; + *pScratchOut++ += (q31_t) *px++ * coeff; + *pScratchOut++ += (q31_t) *px++ * coeff; + /* Decrement loop counter */ blkCnt--; - } - /* If the blockSize is not a multiple of 4, - remaining samples are processed in the below loop */ + /* Loop unrolling: Compute remaining outputs */ blkCnt = blockSize % 0x4U; - while (blkCnt > 0U) - { - *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); - blkCnt--; - } - #else - /* Run the below code for Cortex-M0 */ - - /* BlockSize of Input samples are copied into the state buffer */ - /* StateIndex points to the starting position to write in the state buffer */ - arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize); - - /* Loop over the number of taps. */ - tapCnt = numTaps; - - /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = (S->stateIndex - blockSize) - *pTapDelay++; - - /* Wraparound of readIndex */ - if (readIndex < 0) - { - readIndex += (int32_t) delaySize; - } - - /* Working pointer for state buffer is updated */ - py = pState; - - /* blockSize samples are read from the state buffer */ - arm_circularRead_q15(py, delaySize, &readIndex, 1, - pb, pb, blockSize, 1, blockSize); - - /* Working pointer for the scratch buffer of state values */ - px = pb; - - /* Working pointer for scratch buffer of output values */ - pScratchOut = pScr2; - + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { - /* Perform multiplication and store in the scratch buffer */ - *pScratchOut++ = ((q31_t) * px++ * coeff); + /* Perform Multiply-Accumulate */ + *pScratchOut++ += (q31_t) *px++ * coeff; - /* Decrement the loop counter */ + /* Decrement loop counter */ blkCnt--; } - /* Load the coefficient value and - * increment the coefficient buffer for the next set of state values */ - coeff = *pCoeffs++; - - /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = (S->stateIndex - blockSize) - *pTapDelay++; - - /* Wraparound of readIndex */ - if (readIndex < 0) - { - readIndex += (int32_t) delaySize; - } + /* All the output values are in pScratchOut buffer. + Convert them into 1.15 format, saturate and store in the destination buffer. */ +#if defined (ARM_MATH_LOOPUNROLL) - /* Loop over the number of taps. */ - tapCnt = (uint32_t) numTaps - 2U; + /* Loop unrolling: Compute 4 outputs at a time. */ + blkCnt = blockSize >> 2U; - while (tapCnt > 0U) + while (blkCnt > 0U) { - /* Working pointer for state buffer is updated */ - py = pState; - - /* blockSize samples are read from the state buffer */ - arm_circularRead_q15(py, delaySize, &readIndex, 1, - pb, pb, blockSize, 1, blockSize); - - /* Working pointer for the scratch buffer of state values */ - px = pb; - - /* Working pointer for scratch buffer of output values */ - pScratchOut = pScr2; - - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* Perform Multiply-Accumulate */ - *pScratchOut++ += (q31_t) * px++ * coeff; - - /* Decrement the loop counter */ - blkCnt--; - } + in1 = *pScr2++; + in2 = *pScr2++; - /* Load the coefficient value and - * increment the coefficient buffer for the next set of state values */ - coeff = *pCoeffs++; +#ifndef ARM_MATH_BIG_ENDIAN + write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16)); +#else + write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16)); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - /* Read Index, from where the state buffer should be read, is calculated. */ - readIndex = (S->stateIndex - blockSize) - *pTapDelay++; + in1 = *pScr2++; + in2 = *pScr2++; - /* Wraparound of readIndex */ - if (readIndex < 0) - { - readIndex += (int32_t) delaySize; - } +#ifndef ARM_MATH_BIG_ENDIAN + write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16)); +#else + write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16)); +#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - /* Decrement the tap loop counter */ - tapCnt--; + /* Decrement loop counter */ + blkCnt--; } - /* Compute last tap without the final read of pTapDelay */ - - /* Working pointer for state buffer is updated */ - py = pState; - - /* blockSize samples are read from the state buffer */ - arm_circularRead_q15(py, delaySize, &readIndex, 1, - pb, pb, blockSize, 1, blockSize); - - /* Working pointer for the scratch buffer of state values */ - px = pb; - - /* Working pointer for scratch buffer of output values */ - pScratchOut = pScr2; - - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* Perform Multiply-Accumulate */ - *pScratchOut++ += (q31_t) * px++ * coeff; + /* Loop unrolling: Compute remaining outputs */ + blkCnt = blockSize % 0x4U; - /* Decrement the loop counter */ - blkCnt--; - } +#else - /* All the output values are in pScratchOut buffer. - Convert them into 1.15 format, saturate and store in the destination buffer. */ - /* Loop over the blockSize. */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + while (blkCnt > 0U) { *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16); + + /* Decrement loop counter */ blkCnt--; } -#endif /* #if defined (ARM_MATH_DSP) */ - } /** - * @} end of FIR_Sparse group + @} end of FIR_Sparse group */ |