00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_fast_q31.c 00009 * 00010 * Description: Fast Q31 Correlation. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated. 00028 * -------------------------------------------------------------------- */ 00029 00030 #include "arm_math.h" 00031 00070 void arm_correlate_fast_q31( 00071 q31_t * pSrcA, 00072 uint32_t srcALen, 00073 q31_t * pSrcB, 00074 uint32_t srcBLen, 00075 q31_t * pDst) 00076 { 00077 q31_t *pIn1; /* inputA pointer */ 00078 q31_t *pIn2; /* inputB pointer */ 00079 q31_t *pOut = pDst; /* output pointer */ 00080 q31_t *px; /* Intermediate inputA pointer */ 00081 q31_t *py; /* Intermediate inputB pointer */ 00082 q31_t *pSrc1; /* Intermediate pointers */ 00083 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00084 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00085 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00086 int32_t inc = 1; /* Destination address modifier */ 00087 00088 00089 /* The algorithm implementation is based on the lengths of the inputs. */ 00090 /* srcB is always made to slide across srcA. */ 00091 /* So srcBLen is always considered as shorter or equal to srcALen */ 00092 if(srcALen >= srcBLen) 00093 { 00094 /* Initialization of inputA pointer */ 00095 pIn1 = (pSrcA); 00096 00097 /* Initialization of inputB pointer */ 00098 pIn2 = (pSrcB); 00099 00100 /* Number of output samples is calculated */ 00101 outBlockSize = (2u * srcALen) - 1u; 00102 00103 /* When srcALen > srcBLen, zero padding is done to srcB 00104 * to make their lengths equal. 00105 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00106 * number of output samples are made zero */ 00107 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00108 00109 /* Updating the pointer position to non zero value */ 00110 pOut += j; 00111 00112 } 00113 else 00114 { 00115 /* Initialization of inputA pointer */ 00116 pIn1 = (pSrcB); 00117 00118 /* Initialization of inputB pointer */ 00119 pIn2 = (pSrcA); 00120 00121 /* srcBLen is always considered as shorter or equal to srcALen */ 00122 j = srcBLen; 00123 srcBLen = srcALen; 00124 srcALen = j; 00125 00126 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00127 /* Hence set the destination pointer to point to the last output sample */ 00128 pOut = pDst + ((srcALen + srcBLen) - 2u); 00129 00130 /* Destination address modifier is set to -1 */ 00131 inc = -1; 00132 00133 } 00134 00135 /* The function is internally 00136 * divided into three parts according to the number of multiplications that has to be 00137 * taken place between inputA samples and inputB samples. In the first part of the 00138 * algorithm, the multiplications increase by one for every iteration. 00139 * In the second part of the algorithm, srcBLen number of multiplications are done. 00140 * In the third part of the algorithm, the multiplications decrease by one 00141 * for every iteration.*/ 00142 /* The algorithm is implemented in three stages. 00143 * The loop counters of each stage is initiated here. */ 00144 blockSize1 = srcBLen - 1u; 00145 blockSize2 = srcALen - (srcBLen - 1u); 00146 blockSize3 = blockSize1; 00147 00148 /* -------------------------- 00149 * Initializations of stage1 00150 * -------------------------*/ 00151 00152 /* sum = x[0] * y[srcBlen - 1] 00153 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00154 * .... 00155 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00156 */ 00157 00158 /* In this stage the MAC operations are increased by 1 for every iteration. 00159 The count variable holds the number of MAC operations performed */ 00160 count = 1u; 00161 00162 /* Working pointer of inputA */ 00163 px = pIn1; 00164 00165 /* Working pointer of inputB */ 00166 pSrc1 = pIn2 + (srcBLen - 1u); 00167 py = pSrc1; 00168 00169 /* ------------------------ 00170 * Stage1 process 00171 * ----------------------*/ 00172 00173 /* The first stage starts here */ 00174 while(blockSize1 > 0u) 00175 { 00176 /* Accumulator is made zero for every iteration */ 00177 sum = 0; 00178 00179 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00180 k = count >> 2; 00181 00182 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00183 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00184 while(k > 0u) 00185 { 00186 /* x[0] * y[srcBLen - 4] */ 00187 sum = (q31_t) ((((q63_t) sum << 32) + 00188 ((q63_t) * px++ * (*py++))) >> 32); 00189 /* x[1] * y[srcBLen - 3] */ 00190 sum = (q31_t) ((((q63_t) sum << 32) + 00191 ((q63_t) * px++ * (*py++))) >> 32); 00192 /* x[2] * y[srcBLen - 2] */ 00193 sum = (q31_t) ((((q63_t) sum << 32) + 00194 ((q63_t) * px++ * (*py++))) >> 32); 00195 /* x[3] * y[srcBLen - 1] */ 00196 sum = (q31_t) ((((q63_t) sum << 32) + 00197 ((q63_t) * px++ * (*py++))) >> 32); 00198 00199 /* Decrement the loop counter */ 00200 k--; 00201 } 00202 00203 /* If the count is not a multiple of 4, compute any remaining MACs here. 00204 ** No loop unrolling is used. */ 00205 k = count % 0x4u; 00206 00207 while(k > 0u) 00208 { 00209 /* Perform the multiply-accumulates */ 00210 /* x[0] * y[srcBLen - 1] */ 00211 sum = (q31_t) ((((q63_t) sum << 32) + 00212 ((q63_t) * px++ * (*py++))) >> 32); 00213 00214 /* Decrement the loop counter */ 00215 k--; 00216 } 00217 00218 /* Store the result in the accumulator in the destination buffer. */ 00219 *pOut = sum << 1; 00220 /* Destination pointer is updated according to the address modifier, inc */ 00221 pOut += inc; 00222 00223 /* Update the inputA and inputB pointers for next MAC calculation */ 00224 py = pSrc1 - count; 00225 px = pIn1; 00226 00227 /* Increment the MAC count */ 00228 count++; 00229 00230 /* Decrement the loop counter */ 00231 blockSize1--; 00232 } 00233 00234 /* -------------------------- 00235 * Initializations of stage2 00236 * ------------------------*/ 00237 00238 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00239 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00240 * .... 00241 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00242 */ 00243 00244 /* Working pointer of inputA */ 00245 px = pIn1; 00246 00247 /* Working pointer of inputB */ 00248 py = pIn2; 00249 00250 /* count is index by which the pointer pIn1 to be incremented */ 00251 count = 1u; 00252 00253 /* ------------------- 00254 * Stage2 process 00255 * ------------------*/ 00256 00257 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00258 * So, to loop unroll over blockSize2, 00259 * srcBLen should be greater than or equal to 4 */ 00260 if(srcBLen >= 4u) 00261 { 00262 /* Loop unroll over blockSize2, by 4 */ 00263 blkCnt = blockSize2 >> 2u; 00264 00265 while(blkCnt > 0u) 00266 { 00267 /* Set all accumulators to zero */ 00268 acc0 = 0; 00269 acc1 = 0; 00270 acc2 = 0; 00271 acc3 = 0; 00272 00273 /* read x[0], x[1], x[2] samples */ 00274 x0 = *(px++); 00275 x1 = *(px++); 00276 x2 = *(px++); 00277 00278 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00279 k = srcBLen >> 2u; 00280 00281 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00282 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00283 do 00284 { 00285 /* Read y[0] sample */ 00286 c0 = *(py++); 00287 00288 /* Read x[3] sample */ 00289 x3 = *(px++); 00290 00291 /* Perform the multiply-accumulate */ 00292 /* acc0 += x[0] * y[0] */ 00293 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00294 /* acc1 += x[1] * y[0] */ 00295 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00296 /* acc2 += x[2] * y[0] */ 00297 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00298 /* acc3 += x[3] * y[0] */ 00299 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00300 00301 /* Read y[1] sample */ 00302 c0 = *(py++); 00303 00304 /* Read x[4] sample */ 00305 x0 = *(px++); 00306 00307 /* Perform the multiply-accumulates */ 00308 /* acc0 += x[1] * y[1] */ 00309 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00310 /* acc1 += x[2] * y[1] */ 00311 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00312 /* acc2 += x[3] * y[1] */ 00313 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00314 /* acc3 += x[4] * y[1] */ 00315 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00316 00317 /* Read y[2] sample */ 00318 c0 = *(py++); 00319 00320 /* Read x[5] sample */ 00321 x1 = *(px++); 00322 00323 /* Perform the multiply-accumulates */ 00324 /* acc0 += x[2] * y[2] */ 00325 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00326 /* acc1 += x[3] * y[2] */ 00327 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00328 /* acc2 += x[4] * y[2] */ 00329 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00330 /* acc3 += x[5] * y[2] */ 00331 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00332 00333 /* Read y[3] sample */ 00334 c0 = *(py++); 00335 00336 /* Read x[6] sample */ 00337 x2 = *(px++); 00338 00339 /* Perform the multiply-accumulates */ 00340 /* acc0 += x[3] * y[3] */ 00341 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00342 /* acc1 += x[4] * y[3] */ 00343 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00344 /* acc2 += x[5] * y[3] */ 00345 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00346 /* acc3 += x[6] * y[3] */ 00347 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00348 00349 00350 } while(--k); 00351 00352 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00353 ** No loop unrolling is used. */ 00354 k = srcBLen % 0x4u; 00355 00356 while(k > 0u) 00357 { 00358 /* Read y[4] sample */ 00359 c0 = *(py++); 00360 00361 /* Read x[7] sample */ 00362 x3 = *(px++); 00363 00364 /* Perform the multiply-accumulates */ 00365 /* acc0 += x[4] * y[4] */ 00366 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00367 /* acc1 += x[5] * y[4] */ 00368 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00369 /* acc2 += x[6] * y[4] */ 00370 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00371 /* acc3 += x[7] * y[4] */ 00372 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00373 00374 /* Reuse the present samples for the next MAC */ 00375 x0 = x1; 00376 x1 = x2; 00377 x2 = x3; 00378 00379 /* Decrement the loop counter */ 00380 k--; 00381 } 00382 00383 /* Store the result in the accumulator in the destination buffer. */ 00384 *pOut = (q31_t) (acc0 << 1); 00385 /* Destination pointer is updated according to the address modifier, inc */ 00386 pOut += inc; 00387 00388 *pOut = (q31_t) (acc1 << 1); 00389 pOut += inc; 00390 00391 *pOut = (q31_t) (acc2 << 1); 00392 pOut += inc; 00393 00394 *pOut = (q31_t) (acc3 << 1); 00395 pOut += inc; 00396 00397 /* Update the inputA and inputB pointers for next MAC calculation */ 00398 px = pIn1 + (count * 4u); 00399 py = pIn2; 00400 00401 /* Increment the pointer pIn1 index, count by 1 */ 00402 count++; 00403 00404 /* Decrement the loop counter */ 00405 blkCnt--; 00406 } 00407 00408 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00409 ** No loop unrolling is used. */ 00410 blkCnt = blockSize2 % 0x4u; 00411 00412 while(blkCnt > 0u) 00413 { 00414 /* Accumulator is made zero for every iteration */ 00415 sum = 0; 00416 00417 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00418 k = srcBLen >> 2u; 00419 00420 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00421 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00422 while(k > 0u) 00423 { 00424 /* Perform the multiply-accumulates */ 00425 sum = (q31_t) ((((q63_t) sum << 32) + 00426 ((q63_t) * px++ * (*py++))) >> 32); 00427 sum = (q31_t) ((((q63_t) sum << 32) + 00428 ((q63_t) * px++ * (*py++))) >> 32); 00429 sum = (q31_t) ((((q63_t) sum << 32) + 00430 ((q63_t) * px++ * (*py++))) >> 32); 00431 sum = (q31_t) ((((q63_t) sum << 32) + 00432 ((q63_t) * px++ * (*py++))) >> 32); 00433 00434 /* Decrement the loop counter */ 00435 k--; 00436 } 00437 00438 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00439 ** No loop unrolling is used. */ 00440 k = srcBLen % 0x4u; 00441 00442 while(k > 0u) 00443 { 00444 /* Perform the multiply-accumulate */ 00445 sum = (q31_t) ((((q63_t) sum << 32) + 00446 ((q63_t) * px++ * (*py++))) >> 32); 00447 00448 /* Decrement the loop counter */ 00449 k--; 00450 } 00451 00452 /* Store the result in the accumulator in the destination buffer. */ 00453 *pOut = sum << 1; 00454 /* Destination pointer is updated according to the address modifier, inc */ 00455 pOut += inc; 00456 00457 /* Update the inputA and inputB pointers for next MAC calculation */ 00458 px = pIn1 + count; 00459 py = pIn2; 00460 00461 /* Increment the MAC count */ 00462 count++; 00463 00464 /* Decrement the loop counter */ 00465 blkCnt--; 00466 } 00467 } 00468 else 00469 { 00470 /* If the srcBLen is not a multiple of 4, 00471 * the blockSize2 loop cannot be unrolled by 4 */ 00472 blkCnt = blockSize2; 00473 00474 while(blkCnt > 0u) 00475 { 00476 /* Accumulator is made zero for every iteration */ 00477 sum = 0; 00478 00479 /* Loop over srcBLen */ 00480 k = srcBLen; 00481 00482 while(k > 0u) 00483 { 00484 /* Perform the multiply-accumulate */ 00485 sum = (q31_t) ((((q63_t) sum << 32) + 00486 ((q63_t) * px++ * (*py++))) >> 32); 00487 00488 /* Decrement the loop counter */ 00489 k--; 00490 } 00491 00492 /* Store the result in the accumulator in the destination buffer. */ 00493 *pOut = sum << 1; 00494 /* Destination pointer is updated according to the address modifier, inc */ 00495 pOut += inc; 00496 00497 /* Update the inputA and inputB pointers for next MAC calculation */ 00498 px = pIn1 + count; 00499 py = pIn2; 00500 00501 /* Increment the MAC count */ 00502 count++; 00503 00504 /* Decrement the loop counter */ 00505 blkCnt--; 00506 } 00507 } 00508 00509 /* -------------------------- 00510 * Initializations of stage3 00511 * -------------------------*/ 00512 00513 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00514 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00515 * .... 00516 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00517 * sum += x[srcALen-1] * y[0] 00518 */ 00519 00520 /* In this stage the MAC operations are decreased by 1 for every iteration. 00521 The count variable holds the number of MAC operations performed */ 00522 count = srcBLen - 1u; 00523 00524 /* Working pointer of inputA */ 00525 pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u; 00526 px = pSrc1; 00527 00528 /* Working pointer of inputB */ 00529 py = pIn2; 00530 00531 /* ------------------- 00532 * Stage3 process 00533 * ------------------*/ 00534 00535 while(blockSize3 > 0u) 00536 { 00537 /* Accumulator is made zero for every iteration */ 00538 sum = 0; 00539 00540 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00541 k = count >> 2u; 00542 00543 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00544 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00545 while(k > 0u) 00546 { 00547 /* Perform the multiply-accumulates */ 00548 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00549 sum = (q31_t) ((((q63_t) sum << 32) + 00550 ((q63_t) * px++ * (*py++))) >> 32); 00551 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00552 sum = (q31_t) ((((q63_t) sum << 32) + 00553 ((q63_t) * px++ * (*py++))) >> 32); 00554 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00555 sum = (q31_t) ((((q63_t) sum << 32) + 00556 ((q63_t) * px++ * (*py++))) >> 32); 00557 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00558 sum = (q31_t) ((((q63_t) sum << 32) + 00559 ((q63_t) * px++ * (*py++))) >> 32); 00560 00561 /* Decrement the loop counter */ 00562 k--; 00563 } 00564 00565 /* If the count is not a multiple of 4, compute any remaining MACs here. 00566 ** No loop unrolling is used. */ 00567 k = count % 0x4u; 00568 00569 while(k > 0u) 00570 { 00571 /* Perform the multiply-accumulates */ 00572 sum = (q31_t) ((((q63_t) sum << 32) + 00573 ((q63_t) * px++ * (*py++))) >> 32); 00574 00575 /* Decrement the loop counter */ 00576 k--; 00577 } 00578 00579 /* Store the result in the accumulator in the destination buffer. */ 00580 *pOut = sum << 1; 00581 /* Destination pointer is updated according to the address modifier, inc */ 00582 pOut += inc; 00583 00584 /* Update the inputA and inputB pointers for next MAC calculation */ 00585 px = ++pSrc1; 00586 py = pIn2; 00587 00588 /* Decrement the MAC count */ 00589 count--; 00590 00591 /* Decrement the loop counter */ 00592 blockSize3--; 00593 } 00594 00595 } 00596