00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_fir_q15.c 00009 * 00010 * Description: Q15 FIR filter processing function. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * 00029 * Version 0.0.5 2010/04/26 00030 * incorporated review comments and updated with latest CMSIS layer 00031 * 00032 * Version 0.0.3 2010/03/10 00033 * Initial version 00034 * -------------------------------------------------------------------- */ 00035 00036 #include "arm_math.h" 00037 00068 void arm_fir_q15( 00069 const arm_fir_instance_q15 * S, 00070 q15_t * pSrc, 00071 q15_t * pDst, 00072 uint32_t blockSize) 00073 { 00074 q15_t *pState = S->pState; /* State pointer */ 00075 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00076 q15_t *pStateCurnt; /* Points to the current sample of the state */ 00077 00078 00079 #ifndef ARM_MATH_CM0 00080 00081 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00082 00083 q15_t *px1; /* Temporary q15 pointer for state buffer */ 00084 q31_t *pb; /* Temporary pointer for coefficient buffer */ 00085 q31_t *px2; /* Temporary q31 pointer for SIMD state buffer accesses */ 00086 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */ 00087 q63_t acc0, acc1, acc2, acc3; /* Accumulators */ 00088 uint32_t numTaps = S->numTaps; /* Number of taps in the filter */ 00089 uint32_t tapCnt, blkCnt; /* Loop counters */ 00090 00091 /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ 00092 /* pStateCurnt points to the location where the new input data should be written */ 00093 pStateCurnt = &(S->pState[(numTaps - 1u)]); 00094 00095 /* Apply loop unrolling and compute 4 output values simultaneously. 00096 * The variables acc0 ... acc3 hold output values that are being computed: 00097 * 00098 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] 00099 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] 00100 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] 00101 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] 00102 */ 00103 blkCnt = blockSize >> 2; 00104 00105 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00106 ** a second loop below computes the remaining 1 to 3 samples. */ 00107 while(blkCnt > 0u) 00108 { 00109 /* Copy four new input samples into the state buffer. 00110 ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */ 00111 *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; 00112 *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++; 00113 00114 /* Set all accumulators to zero */ 00115 acc0 = 0; 00116 acc1 = 0; 00117 acc2 = 0; 00118 acc3 = 0; 00119 00120 /* Initialize state pointer of type q15 */ 00121 px1 = pState; 00122 00123 /* Initialize coeff pointer of type q31 */ 00124 pb = (q31_t *) (pCoeffs); 00125 00126 /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */ 00127 x0 = *(q31_t *) (px1++); 00128 00129 /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */ 00130 x1 = *(q31_t *) (px1++); 00131 00132 /* Loop over the number of taps. Unroll by a factor of 4. 00133 ** Repeat until we've computed numTaps-4 coefficients. */ 00134 tapCnt = numTaps >> 2; 00135 do 00136 { 00137 /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */ 00138 c0 = *(pb++); 00139 00140 /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */ 00141 acc0 = __SMLALD(x0, c0, acc0); 00142 00143 /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */ 00144 acc1 = __SMLALD(x1, c0, acc1); 00145 00146 /* Read state x[n-N-2], x[n-N-3] */ 00147 x2 = *(q31_t *) (px1++); 00148 00149 /* Read state x[n-N-3], x[n-N-4] */ 00150 x3 = *(q31_t *) (px1++); 00151 00152 /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */ 00153 acc2 = __SMLALD(x2, c0, acc2); 00154 00155 /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */ 00156 acc3 = __SMLALD(x3, c0, acc3); 00157 00158 /* Read coefficients b[N-2], b[N-3] */ 00159 c0 = *(pb++); 00160 00161 /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */ 00162 acc0 = __SMLALD(x2, c0, acc0); 00163 00164 /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */ 00165 acc1 = __SMLALD(x3, c0, acc1); 00166 00167 /* Read state x[n-N-4], x[n-N-5] */ 00168 x0 = *(q31_t *) (px1++); 00169 00170 /* Read state x[n-N-5], x[n-N-6] */ 00171 x1 = *(q31_t *) (px1++); 00172 00173 /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */ 00174 acc2 = __SMLALD(x0, c0, acc2); 00175 00176 /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */ 00177 acc3 = __SMLALD(x1, c0, acc3); 00178 tapCnt--; 00179 00180 } 00181 while(tapCnt > 0u); 00182 00183 /* If the filter length is not a multiple of 4, compute the remaining filter taps. 00184 ** This is always be 2 taps since the filter length is even. */ 00185 if((numTaps & 0x3u) != 0u) 00186 { 00187 /* Read 2 coefficients */ 00188 c0 = *(pb++); 00189 /* Fetch 4 state variables */ 00190 x2 = *(q31_t *) (px1++); 00191 x3 = *(q31_t *) (px1++); 00192 00193 /* Perform the multiply-accumulates */ 00194 acc0 = __SMLALD(x0, c0, acc0); 00195 acc1 = __SMLALD(x1, c0, acc1); 00196 acc2 = __SMLALD(x2, c0, acc2); 00197 acc3 = __SMLALD(x3, c0, acc3); 00198 } 00199 00200 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation. 00201 ** Then store the 4 outputs in the destination buffer. */ 00202 00203 #ifndef ARM_MATH_BIG_ENDIAN 00204 00205 *__SIMD32(pDst)++ = 00206 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16); 00207 *__SIMD32(pDst)++ = 00208 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16); 00209 00210 #else 00211 00212 *__SIMD32(pDst)++ = 00213 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16); 00214 *__SIMD32(pDst)++ = 00215 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16); 00216 00217 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */ 00218 00219 /* Advance the state pointer by 4 to process the next group of 4 samples */ 00220 pState = pState + 4; 00221 00222 /* Decrement the loop counter */ 00223 blkCnt--; 00224 } 00225 00226 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00227 ** No loop unrolling is used. */ 00228 blkCnt = blockSize % 0x4u; 00229 while(blkCnt > 0u) 00230 { 00231 /* Copy two samples into state buffer */ 00232 *pStateCurnt++ = *pSrc++; 00233 00234 /* Set the accumulator to zero */ 00235 acc0 = 0; 00236 00237 /* Use SIMD to hold states and coefficients */ 00238 px2 = (q31_t *) pState; 00239 pb = (q31_t *) (pCoeffs); 00240 tapCnt = numTaps >> 1; 00241 00242 do 00243 { 00244 acc0 = __SMLALD(*px2++, *(pb++), acc0); 00245 tapCnt--; 00246 } 00247 while(tapCnt > 0u); 00248 00249 /* The result is in 2.30 format. Convert to 1.15 with saturation. 00250 ** Then store the output in the destination buffer. */ 00251 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); 00252 00253 /* Advance state pointer by 1 for the next sample */ 00254 pState = pState + 1; 00255 00256 /* Decrement the loop counter */ 00257 blkCnt--; 00258 } 00259 00260 /* Processing is complete. 00261 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00262 ** This prepares the state buffer for the next function call. */ 00263 00264 /* Points to the start of the state buffer */ 00265 pStateCurnt = S->pState; 00266 00267 /* Calculation of count for copying integer writes */ 00268 tapCnt = (numTaps - 1u) >> 2; 00269 00270 while(tapCnt > 0u) 00271 { 00272 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00273 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++; 00274 00275 tapCnt--; 00276 00277 } 00278 00279 /* Calculation of count for remaining q15_t data */ 00280 tapCnt = (numTaps - 1u) % 0x4u; 00281 00282 /* copy remaining data */ 00283 while(tapCnt > 0u) 00284 { 00285 *pStateCurnt++ = *pState++; 00286 00287 /* Decrement the loop counter */ 00288 tapCnt--; 00289 } 00290 00291 #else 00292 00293 /* Run the below code for Cortex-M0 */ 00294 00295 q15_t *px; /* Temporary pointer for state buffer */ 00296 q15_t *pb; /* Temporary pointer for coefficient buffer */ 00297 q63_t acc; /* Accumulator */ 00298 uint32_t numTaps = S->numTaps; /* Number of nTaps in the filter */ 00299 uint32_t tapCnt, blkCnt; /* Loop counters */ 00300 00301 /* S->pState buffer contains previous frame (numTaps - 1) samples */ 00302 /* pStateCurnt points to the location where the new input data should be written */ 00303 pStateCurnt = &(S->pState[(numTaps - 1u)]); 00304 00305 /* Initialize blkCnt with blockSize */ 00306 blkCnt = blockSize; 00307 00308 while(blkCnt > 0u) 00309 { 00310 /* Copy one sample at a time into state buffer */ 00311 *pStateCurnt++ = *pSrc++; 00312 00313 /* Set the accumulator to zero */ 00314 acc = 0; 00315 00316 /* Initialize state pointer */ 00317 px = pState; 00318 00319 /* Initialize Coefficient pointer */ 00320 pb = pCoeffs; 00321 00322 tapCnt = numTaps; 00323 00324 /* Perform the multiply-accumulates */ 00325 do 00326 { 00327 /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ 00328 acc += (q31_t) * px++ * *pb++; 00329 tapCnt--; 00330 } while(tapCnt > 0u); 00331 00332 /* The result is in 2.30 format. Convert to 1.15 00333 ** Then store the output in the destination buffer. */ 00334 *pDst++ = (q15_t) __SSAT((acc >> 15u), 16); 00335 00336 /* Advance state pointer by 1 for the next sample */ 00337 pState = pState + 1; 00338 00339 /* Decrement the samples loop counter */ 00340 blkCnt--; 00341 } 00342 00343 /* Processing is complete. 00344 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00345 ** This prepares the state buffer for the next function call. */ 00346 00347 /* Points to the start of the state buffer */ 00348 pStateCurnt = S->pState; 00349 00350 /* Copy numTaps number of values */ 00351 tapCnt = (numTaps - 1u); 00352 00353 /* copy data */ 00354 while(tapCnt > 0u) 00355 { 00356 *pStateCurnt++ = *pState++; 00357 00358 /* Decrement the loop counter */ 00359 tapCnt--; 00360 } 00361 00362 #endif /* #ifndef ARM_MATH_CM0 */ 00363 00364 } 00365