00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_f32.c 00009 * 00010 * Description: Convolution of floating-point sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00103 void arm_conv_f32( 00104 float32_t * pSrcA, 00105 uint32_t srcALen, 00106 float32_t * pSrcB, 00107 uint32_t srcBLen, 00108 float32_t * pDst) 00109 { 00110 00111 00112 #ifndef ARM_MATH_CM0 00113 00114 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00115 00116 float32_t *pIn1; /* inputA pointer */ 00117 float32_t *pIn2; /* inputB pointer */ 00118 float32_t *pOut = pDst; /* output pointer */ 00119 float32_t *px; /* Intermediate inputA pointer */ 00120 float32_t *py; /* Intermediate inputB pointer */ 00121 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00122 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00123 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00124 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */ 00125 00126 /* The algorithm implementation is based on the lengths of the inputs. */ 00127 /* srcB is always made to slide across srcA. */ 00128 /* So srcBLen is always considered as shorter or equal to srcALen */ 00129 if(srcALen >= srcBLen) 00130 { 00131 /* Initialization of inputA pointer */ 00132 pIn1 = pSrcA; 00133 00134 /* Initialization of inputB pointer */ 00135 pIn2 = pSrcB; 00136 } 00137 else 00138 { 00139 /* Initialization of inputA pointer */ 00140 pIn1 = pSrcB; 00141 00142 /* Initialization of inputB pointer */ 00143 pIn2 = pSrcA; 00144 00145 /* srcBLen is always considered as shorter or equal to srcALen */ 00146 j = srcBLen; 00147 srcBLen = srcALen; 00148 srcALen = j; 00149 } 00150 00151 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00152 /* The function is internally 00153 * divided into three stages according to the number of multiplications that has to be 00154 * taken place between inputA samples and inputB samples. In the first stage of the 00155 * algorithm, the multiplications increase by one for every iteration. 00156 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00157 * In the third stage of the algorithm, the multiplications decrease by one 00158 * for every iteration. */ 00159 00160 /* The algorithm is implemented in three stages. 00161 The loop counters of each stage is initiated here. */ 00162 blockSize1 = srcBLen - 1u; 00163 blockSize2 = srcALen - (srcBLen - 1u); 00164 blockSize3 = blockSize1; 00165 00166 /* -------------------------- 00167 * initializations of stage1 00168 * -------------------------*/ 00169 00170 /* sum = x[0] * y[0] 00171 * sum = x[0] * y[1] + x[1] * y[0] 00172 * .... 00173 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00174 */ 00175 00176 /* In this stage the MAC operations are increased by 1 for every iteration. 00177 The count variable holds the number of MAC operations performed */ 00178 count = 1u; 00179 00180 /* Working pointer of inputA */ 00181 px = pIn1; 00182 00183 /* Working pointer of inputB */ 00184 py = pIn2; 00185 00186 00187 /* ------------------------ 00188 * Stage1 process 00189 * ----------------------*/ 00190 00191 /* The first stage starts here */ 00192 while(blockSize1 > 0u) 00193 { 00194 /* Accumulator is made zero for every iteration */ 00195 sum = 0.0f; 00196 00197 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00198 k = count >> 2u; 00199 00200 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00201 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00202 while(k > 0u) 00203 { 00204 /* x[0] * y[srcBLen - 1] */ 00205 sum += *px++ * *py--; 00206 00207 /* x[1] * y[srcBLen - 2] */ 00208 sum += *px++ * *py--; 00209 00210 /* x[2] * y[srcBLen - 3] */ 00211 sum += *px++ * *py--; 00212 00213 /* x[3] * y[srcBLen - 4] */ 00214 sum += *px++ * *py--; 00215 00216 /* Decrement the loop counter */ 00217 k--; 00218 } 00219 00220 /* If the count is not a multiple of 4, compute any remaining MACs here. 00221 ** No loop unrolling is used. */ 00222 k = count % 0x4u; 00223 00224 while(k > 0u) 00225 { 00226 /* Perform the multiply-accumulate */ 00227 sum += *px++ * *py--; 00228 00229 /* Decrement the loop counter */ 00230 k--; 00231 } 00232 00233 /* Store the result in the accumulator in the destination buffer. */ 00234 *pOut++ = sum; 00235 00236 /* Update the inputA and inputB pointers for next MAC calculation */ 00237 py = pIn2 + count; 00238 px = pIn1; 00239 00240 /* Increment the MAC count */ 00241 count++; 00242 00243 /* Decrement the loop counter */ 00244 blockSize1--; 00245 } 00246 00247 /* -------------------------- 00248 * Initializations of stage2 00249 * ------------------------*/ 00250 00251 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00252 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00253 * .... 00254 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00255 */ 00256 00257 /* Working pointer of inputA */ 00258 px = pIn1; 00259 00260 /* Working pointer of inputB */ 00261 pSrc2 = pIn2 + (srcBLen - 1u); 00262 py = pSrc2; 00263 00264 /* count is index by which the pointer pIn1 to be incremented */ 00265 count = 1u; 00266 00267 /* ------------------- 00268 * Stage2 process 00269 * ------------------*/ 00270 00271 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00272 * So, to loop unroll over blockSize2, 00273 * srcBLen should be greater than or equal to 4 */ 00274 if(srcBLen >= 4u) 00275 { 00276 /* Loop unroll over blockSize2, by 4 */ 00277 blkCnt = blockSize2 >> 2u; 00278 00279 while(blkCnt > 0u) 00280 { 00281 /* Set all accumulators to zero */ 00282 acc0 = 0.0f; 00283 acc1 = 0.0f; 00284 acc2 = 0.0f; 00285 acc3 = 0.0f; 00286 00287 /* read x[0], x[1], x[2] samples */ 00288 x0 = *(px++); 00289 x1 = *(px++); 00290 x2 = *(px++); 00291 00292 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00293 k = srcBLen >> 2u; 00294 00295 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00296 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00297 do 00298 { 00299 /* Read y[srcBLen - 1] sample */ 00300 c0 = *(py--); 00301 00302 /* Read x[3] sample */ 00303 x3 = *(px++); 00304 00305 /* Perform the multiply-accumulate */ 00306 /* acc0 += x[0] * y[srcBLen - 1] */ 00307 acc0 += x0 * c0; 00308 00309 /* acc1 += x[1] * y[srcBLen - 1] */ 00310 acc1 += x1 * c0; 00311 00312 /* acc2 += x[2] * y[srcBLen - 1] */ 00313 acc2 += x2 * c0; 00314 00315 /* acc3 += x[3] * y[srcBLen - 1] */ 00316 acc3 += x3 * c0; 00317 00318 /* Read y[srcBLen - 2] sample */ 00319 c0 = *(py--); 00320 00321 /* Read x[4] sample */ 00322 x0 = *(px++); 00323 00324 /* Perform the multiply-accumulate */ 00325 /* acc0 += x[1] * y[srcBLen - 2] */ 00326 acc0 += x1 * c0; 00327 /* acc1 += x[2] * y[srcBLen - 2] */ 00328 acc1 += x2 * c0; 00329 /* acc2 += x[3] * y[srcBLen - 2] */ 00330 acc2 += x3 * c0; 00331 /* acc3 += x[4] * y[srcBLen - 2] */ 00332 acc3 += x0 * c0; 00333 00334 /* Read y[srcBLen - 3] sample */ 00335 c0 = *(py--); 00336 00337 /* Read x[5] sample */ 00338 x1 = *(px++); 00339 00340 /* Perform the multiply-accumulates */ 00341 /* acc0 += x[2] * y[srcBLen - 3] */ 00342 acc0 += x2 * c0; 00343 /* acc1 += x[3] * y[srcBLen - 2] */ 00344 acc1 += x3 * c0; 00345 /* acc2 += x[4] * y[srcBLen - 2] */ 00346 acc2 += x0 * c0; 00347 /* acc3 += x[5] * y[srcBLen - 2] */ 00348 acc3 += x1 * c0; 00349 00350 /* Read y[srcBLen - 4] sample */ 00351 c0 = *(py--); 00352 00353 /* Read x[6] sample */ 00354 x2 = *(px++); 00355 00356 /* Perform the multiply-accumulates */ 00357 /* acc0 += x[3] * y[srcBLen - 4] */ 00358 acc0 += x3 * c0; 00359 /* acc1 += x[4] * y[srcBLen - 4] */ 00360 acc1 += x0 * c0; 00361 /* acc2 += x[5] * y[srcBLen - 4] */ 00362 acc2 += x1 * c0; 00363 /* acc3 += x[6] * y[srcBLen - 4] */ 00364 acc3 += x2 * c0; 00365 00366 00367 } while(--k); 00368 00369 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00370 ** No loop unrolling is used. */ 00371 k = srcBLen % 0x4u; 00372 00373 while(k > 0u) 00374 { 00375 /* Read y[srcBLen - 5] sample */ 00376 c0 = *(py--); 00377 00378 /* Read x[7] sample */ 00379 x3 = *(px++); 00380 00381 /* Perform the multiply-accumulates */ 00382 /* acc0 += x[4] * y[srcBLen - 5] */ 00383 acc0 += x0 * c0; 00384 /* acc1 += x[5] * y[srcBLen - 5] */ 00385 acc1 += x1 * c0; 00386 /* acc2 += x[6] * y[srcBLen - 5] */ 00387 acc2 += x2 * c0; 00388 /* acc3 += x[7] * y[srcBLen - 5] */ 00389 acc3 += x3 * c0; 00390 00391 /* Reuse the present samples for the next MAC */ 00392 x0 = x1; 00393 x1 = x2; 00394 x2 = x3; 00395 00396 /* Decrement the loop counter */ 00397 k--; 00398 } 00399 00400 /* Store the result in the accumulator in the destination buffer. */ 00401 *pOut++ = acc0; 00402 *pOut++ = acc1; 00403 *pOut++ = acc2; 00404 *pOut++ = acc3; 00405 00406 /* Update the inputA and inputB pointers for next MAC calculation */ 00407 px = pIn1 + (count * 4u); 00408 py = pSrc2; 00409 00410 /* Increment the pointer pIn1 index, count by 1 */ 00411 count++; 00412 00413 /* Decrement the loop counter */ 00414 blkCnt--; 00415 } 00416 00417 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00418 ** No loop unrolling is used. */ 00419 blkCnt = blockSize2 % 0x4u; 00420 00421 while(blkCnt > 0u) 00422 { 00423 /* Accumulator is made zero for every iteration */ 00424 sum = 0.0f; 00425 00426 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00427 k = srcBLen >> 2u; 00428 00429 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00430 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00431 while(k > 0u) 00432 { 00433 /* Perform the multiply-accumulates */ 00434 sum += *px++ * *py--; 00435 sum += *px++ * *py--; 00436 sum += *px++ * *py--; 00437 sum += *px++ * *py--; 00438 00439 /* Decrement the loop counter */ 00440 k--; 00441 } 00442 00443 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00444 ** No loop unrolling is used. */ 00445 k = srcBLen % 0x4u; 00446 00447 while(k > 0u) 00448 { 00449 /* Perform the multiply-accumulate */ 00450 sum += *px++ * *py--; 00451 00452 /* Decrement the loop counter */ 00453 k--; 00454 } 00455 00456 /* Store the result in the accumulator in the destination buffer. */ 00457 *pOut++ = sum; 00458 00459 /* Update the inputA and inputB pointers for next MAC calculation */ 00460 px = pIn1 + count; 00461 py = pSrc2; 00462 00463 /* Increment the MAC count */ 00464 count++; 00465 00466 /* Decrement the loop counter */ 00467 blkCnt--; 00468 } 00469 } 00470 else 00471 { 00472 /* If the srcBLen is not a multiple of 4, 00473 * the blockSize2 loop cannot be unrolled by 4 */ 00474 blkCnt = blockSize2; 00475 00476 while(blkCnt > 0u) 00477 { 00478 /* Accumulator is made zero for every iteration */ 00479 sum = 0.0f; 00480 00481 /* srcBLen number of MACS should be performed */ 00482 k = srcBLen; 00483 00484 while(k > 0u) 00485 { 00486 /* Perform the multiply-accumulate */ 00487 sum += *px++ * *py--; 00488 00489 /* Decrement the loop counter */ 00490 k--; 00491 } 00492 00493 /* Store the result in the accumulator in the destination buffer. */ 00494 *pOut++ = sum; 00495 00496 /* Update the inputA and inputB pointers for next MAC calculation */ 00497 px = pIn1 + count; 00498 py = pSrc2; 00499 00500 /* Increment the MAC count */ 00501 count++; 00502 00503 /* Decrement the loop counter */ 00504 blkCnt--; 00505 } 00506 } 00507 00508 00509 /* -------------------------- 00510 * Initializations of stage3 00511 * -------------------------*/ 00512 00513 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00514 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00515 * .... 00516 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00517 * sum += x[srcALen-1] * y[srcBLen-1] 00518 */ 00519 00520 /* In this stage the MAC operations are decreased by 1 for every iteration. 00521 The blockSize3 variable holds the number of MAC operations performed */ 00522 00523 /* Working pointer of inputA */ 00524 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00525 px = pSrc1; 00526 00527 /* Working pointer of inputB */ 00528 pSrc2 = pIn2 + (srcBLen - 1u); 00529 py = pSrc2; 00530 00531 /* ------------------- 00532 * Stage3 process 00533 * ------------------*/ 00534 00535 while(blockSize3 > 0u) 00536 { 00537 /* Accumulator is made zero for every iteration */ 00538 sum = 0.0f; 00539 00540 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00541 k = blockSize3 >> 2u; 00542 00543 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00544 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00545 while(k > 0u) 00546 { 00547 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00548 sum += *px++ * *py--; 00549 00550 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00551 sum += *px++ * *py--; 00552 00553 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00554 sum += *px++ * *py--; 00555 00556 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00557 sum += *px++ * *py--; 00558 00559 /* Decrement the loop counter */ 00560 k--; 00561 } 00562 00563 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00564 ** No loop unrolling is used. */ 00565 k = blockSize3 % 0x4u; 00566 00567 while(k > 0u) 00568 { 00569 /* Perform the multiply-accumulates */ 00570 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00571 sum += *px++ * *py--; 00572 00573 /* Decrement the loop counter */ 00574 k--; 00575 } 00576 00577 /* Store the result in the accumulator in the destination buffer. */ 00578 *pOut++ = sum; 00579 00580 /* Update the inputA and inputB pointers for next MAC calculation */ 00581 px = ++pSrc1; 00582 py = pSrc2; 00583 00584 /* Decrement the loop counter */ 00585 blockSize3--; 00586 } 00587 00588 #else 00589 00590 /* Run the below code for Cortex-M0 */ 00591 00592 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00593 float32_t *pIn2 = pSrcB; /* inputB pointer */ 00594 float32_t sum; /* Accumulator */ 00595 uint32_t i, j; /* loop counters */ 00596 00597 /* Loop to calculate convolution for output length number of times */ 00598 for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++) 00599 { 00600 /* Initialize sum with zero to carry out MAC operations */ 00601 sum = 0.0f; 00602 00603 /* Loop to perform MAC operations according to convolution equation */ 00604 for (j = 0u; j <= i; j++) 00605 { 00606 /* Check the array limitations */ 00607 if((((i - j) < srcBLen) && (j < srcALen))) 00608 { 00609 /* z[i] += x[i-j] * y[j] */ 00610 sum += pIn1[j] * pIn2[i - j]; 00611 } 00612 } 00613 /* Store the output in the destination buffer */ 00614 pDst[i] = sum; 00615 } 00616 00617 #endif /* #ifndef ARM_MATH_CM0 */ 00618 00619 } 00620