00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_f32.c 00009 * 00010 * Description: Partial convolution of floating-point sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00074 arm_status arm_conv_partial_f32( 00075 float32_t * pSrcA, 00076 uint32_t srcALen, 00077 float32_t * pSrcB, 00078 uint32_t srcBLen, 00079 float32_t * pDst, 00080 uint32_t firstIndex, 00081 uint32_t numPoints) 00082 { 00083 00084 00085 #ifndef ARM_MATH_CM0 00086 00087 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00088 00089 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00090 float32_t *pIn2 = pSrcB; /* inputB pointer */ 00091 float32_t *pOut = pDst; /* output pointer */ 00092 float32_t *px; /* Intermediate inputA pointer */ 00093 float32_t *py; /* Intermediate inputB pointer */ 00094 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00095 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00096 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00097 uint32_t j, k, count = 0u, blkCnt, check; 00098 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00099 arm_status status; /* status of Partial convolution */ 00100 00101 00102 /* Check for range of output samples to be calculated */ 00103 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00104 { 00105 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00106 status = ARM_MATH_ARGUMENT_ERROR; 00107 } 00108 else 00109 { 00110 00111 /* The algorithm implementation is based on the lengths of the inputs. */ 00112 /* srcB is always made to slide across srcA. */ 00113 /* So srcBLen is always considered as shorter or equal to srcALen */ 00114 if(srcALen >= srcBLen) 00115 { 00116 /* Initialization of inputA pointer */ 00117 pIn1 = pSrcA; 00118 00119 /* Initialization of inputB pointer */ 00120 pIn2 = pSrcB; 00121 } 00122 else 00123 { 00124 /* Initialization of inputA pointer */ 00125 pIn1 = pSrcB; 00126 00127 /* Initialization of inputB pointer */ 00128 pIn2 = pSrcA; 00129 00130 /* srcBLen is always considered as shorter or equal to srcALen */ 00131 j = srcBLen; 00132 srcBLen = srcALen; 00133 srcALen = j; 00134 } 00135 00136 /* Conditions to check which loopCounter holds 00137 * the first and last indices of the output samples to be calculated. */ 00138 check = firstIndex + numPoints; 00139 blockSize3 = (int32_t) check - (int32_t) srcALen; 00140 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00141 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; 00142 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00143 (int32_t) numPoints) : 0; 00144 blockSize2 = ((int32_t) check - blockSize3) - 00145 (blockSize1 + (int32_t) firstIndex); 00146 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00147 00148 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00149 /* The function is internally 00150 * divided into three stages according to the number of multiplications that has to be 00151 * taken place between inputA samples and inputB samples. In the first stage of the 00152 * algorithm, the multiplications increase by one for every iteration. 00153 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00154 * In the third stage of the algorithm, the multiplications decrease by one 00155 * for every iteration. */ 00156 00157 /* Set the output pointer to point to the firstIndex 00158 * of the output sample to be calculated. */ 00159 pOut = pDst + firstIndex; 00160 00161 /* -------------------------- 00162 * Initializations of stage1 00163 * -------------------------*/ 00164 00165 /* sum = x[0] * y[0] 00166 * sum = x[0] * y[1] + x[1] * y[0] 00167 * .... 00168 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00169 */ 00170 00171 /* In this stage the MAC operations are increased by 1 for every iteration. 00172 The count variable holds the number of MAC operations performed. 00173 Since the partial convolution starts from from firstIndex 00174 Number of Macs to be performed is firstIndex + 1 */ 00175 count = 1u + firstIndex; 00176 00177 /* Working pointer of inputA */ 00178 px = pIn1; 00179 00180 /* Working pointer of inputB */ 00181 pSrc1 = pIn2 + firstIndex; 00182 py = pSrc1; 00183 00184 /* ------------------------ 00185 * Stage1 process 00186 * ----------------------*/ 00187 00188 /* The first stage starts here */ 00189 while(blockSize1 > 0) 00190 { 00191 /* Accumulator is made zero for every iteration */ 00192 sum = 0.0f; 00193 00194 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00195 k = count >> 2u; 00196 00197 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00198 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00199 while(k > 0u) 00200 { 00201 /* x[0] * y[srcBLen - 1] */ 00202 sum += *px++ * *py--; 00203 00204 /* x[1] * y[srcBLen - 2] */ 00205 sum += *px++ * *py--; 00206 00207 /* x[2] * y[srcBLen - 3] */ 00208 sum += *px++ * *py--; 00209 00210 /* x[3] * y[srcBLen - 4] */ 00211 sum += *px++ * *py--; 00212 00213 /* Decrement the loop counter */ 00214 k--; 00215 } 00216 00217 /* If the count is not a multiple of 4, compute any remaining MACs here. 00218 ** No loop unrolling is used. */ 00219 k = count % 0x4u; 00220 00221 while(k > 0u) 00222 { 00223 /* Perform the multiply-accumulates */ 00224 sum += *px++ * *py--; 00225 00226 /* Decrement the loop counter */ 00227 k--; 00228 } 00229 00230 /* Store the result in the accumulator in the destination buffer. */ 00231 *pOut++ = sum; 00232 00233 /* Update the inputA and inputB pointers for next MAC calculation */ 00234 py = ++pSrc1; 00235 px = pIn1; 00236 00237 /* Increment the MAC count */ 00238 count++; 00239 00240 /* Decrement the loop counter */ 00241 blockSize1--; 00242 } 00243 00244 /* -------------------------- 00245 * Initializations of stage2 00246 * ------------------------*/ 00247 00248 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00249 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00250 * .... 00251 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00252 */ 00253 00254 /* Working pointer of inputA */ 00255 px = pIn1; 00256 00257 /* Working pointer of inputB */ 00258 pSrc2 = pIn2 + (srcBLen - 1u); 00259 py = pSrc2; 00260 00261 /* count is index by which the pointer pIn1 to be incremented */ 00262 count = 1u; 00263 00264 /* ------------------- 00265 * Stage2 process 00266 * ------------------*/ 00267 00268 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00269 * So, to loop unroll over blockSize2, 00270 * srcBLen should be greater than or equal to 4 */ 00271 if(srcBLen >= 4u) 00272 { 00273 /* Loop unroll over blockSize2, by 4 */ 00274 blkCnt = ((uint32_t) blockSize2 >> 2u); 00275 00276 while(blkCnt > 0u) 00277 { 00278 /* Set all accumulators to zero */ 00279 acc0 = 0.0f; 00280 acc1 = 0.0f; 00281 acc2 = 0.0f; 00282 acc3 = 0.0f; 00283 00284 /* read x[0], x[1], x[2] samples */ 00285 x0 = *(px++); 00286 x1 = *(px++); 00287 x2 = *(px++); 00288 00289 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00290 k = srcBLen >> 2u; 00291 00292 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00293 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00294 do 00295 { 00296 /* Read y[srcBLen - 1] sample */ 00297 c0 = *(py--); 00298 00299 /* Read x[3] sample */ 00300 x3 = *(px++); 00301 00302 /* Perform the multiply-accumulate */ 00303 /* acc0 += x[0] * y[srcBLen - 1] */ 00304 acc0 += x0 * c0; 00305 00306 /* acc1 += x[1] * y[srcBLen - 1] */ 00307 acc1 += x1 * c0; 00308 00309 /* acc2 += x[2] * y[srcBLen - 1] */ 00310 acc2 += x2 * c0; 00311 00312 /* acc3 += x[3] * y[srcBLen - 1] */ 00313 acc3 += x3 * c0; 00314 00315 /* Read y[srcBLen - 2] sample */ 00316 c0 = *(py--); 00317 00318 /* Read x[4] sample */ 00319 x0 = *(px++); 00320 00321 /* Perform the multiply-accumulate */ 00322 /* acc0 += x[1] * y[srcBLen - 2] */ 00323 acc0 += x1 * c0; 00324 /* acc1 += x[2] * y[srcBLen - 2] */ 00325 acc1 += x2 * c0; 00326 /* acc2 += x[3] * y[srcBLen - 2] */ 00327 acc2 += x3 * c0; 00328 /* acc3 += x[4] * y[srcBLen - 2] */ 00329 acc3 += x0 * c0; 00330 00331 /* Read y[srcBLen - 3] sample */ 00332 c0 = *(py--); 00333 00334 /* Read x[5] sample */ 00335 x1 = *(px++); 00336 00337 /* Perform the multiply-accumulates */ 00338 /* acc0 += x[2] * y[srcBLen - 3] */ 00339 acc0 += x2 * c0; 00340 /* acc1 += x[3] * y[srcBLen - 2] */ 00341 acc1 += x3 * c0; 00342 /* acc2 += x[4] * y[srcBLen - 2] */ 00343 acc2 += x0 * c0; 00344 /* acc3 += x[5] * y[srcBLen - 2] */ 00345 acc3 += x1 * c0; 00346 00347 /* Read y[srcBLen - 4] sample */ 00348 c0 = *(py--); 00349 00350 /* Read x[6] sample */ 00351 x2 = *(px++); 00352 00353 /* Perform the multiply-accumulates */ 00354 /* acc0 += x[3] * y[srcBLen - 4] */ 00355 acc0 += x3 * c0; 00356 /* acc1 += x[4] * y[srcBLen - 4] */ 00357 acc1 += x0 * c0; 00358 /* acc2 += x[5] * y[srcBLen - 4] */ 00359 acc2 += x1 * c0; 00360 /* acc3 += x[6] * y[srcBLen - 4] */ 00361 acc3 += x2 * c0; 00362 00363 00364 } while(--k); 00365 00366 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00367 ** No loop unrolling is used. */ 00368 k = srcBLen % 0x4u; 00369 00370 while(k > 0u) 00371 { 00372 /* Read y[srcBLen - 5] sample */ 00373 c0 = *(py--); 00374 00375 /* Read x[7] sample */ 00376 x3 = *(px++); 00377 00378 /* Perform the multiply-accumulates */ 00379 /* acc0 += x[4] * y[srcBLen - 5] */ 00380 acc0 += x0 * c0; 00381 /* acc1 += x[5] * y[srcBLen - 5] */ 00382 acc1 += x1 * c0; 00383 /* acc2 += x[6] * y[srcBLen - 5] */ 00384 acc2 += x2 * c0; 00385 /* acc3 += x[7] * y[srcBLen - 5] */ 00386 acc3 += x3 * c0; 00387 00388 /* Reuse the present samples for the next MAC */ 00389 x0 = x1; 00390 x1 = x2; 00391 x2 = x3; 00392 00393 /* Decrement the loop counter */ 00394 k--; 00395 } 00396 00397 /* Store the result in the accumulator in the destination buffer. */ 00398 *pOut++ = acc0; 00399 *pOut++ = acc1; 00400 *pOut++ = acc2; 00401 *pOut++ = acc3; 00402 00403 /* Update the inputA and inputB pointers for next MAC calculation */ 00404 px = pIn1 + (count * 4u); 00405 py = pSrc2; 00406 00407 /* Increment the pointer pIn1 index, count by 1 */ 00408 count++; 00409 00410 /* Decrement the loop counter */ 00411 blkCnt--; 00412 } 00413 00414 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00415 ** No loop unrolling is used. */ 00416 blkCnt = (uint32_t) blockSize2 % 0x4u; 00417 00418 while(blkCnt > 0u) 00419 { 00420 /* Accumulator is made zero for every iteration */ 00421 sum = 0.0f; 00422 00423 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00424 k = srcBLen >> 2u; 00425 00426 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00427 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00428 while(k > 0u) 00429 { 00430 /* Perform the multiply-accumulates */ 00431 sum += *px++ * *py--; 00432 sum += *px++ * *py--; 00433 sum += *px++ * *py--; 00434 sum += *px++ * *py--; 00435 00436 /* Decrement the loop counter */ 00437 k--; 00438 } 00439 00440 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00441 ** No loop unrolling is used. */ 00442 k = srcBLen % 0x4u; 00443 00444 while(k > 0u) 00445 { 00446 /* Perform the multiply-accumulate */ 00447 sum += *px++ * *py--; 00448 00449 /* Decrement the loop counter */ 00450 k--; 00451 } 00452 00453 /* Store the result in the accumulator in the destination buffer. */ 00454 *pOut++ = sum; 00455 00456 /* Update the inputA and inputB pointers for next MAC calculation */ 00457 px = pIn1 + count; 00458 py = pSrc2; 00459 00460 /* Increment the MAC count */ 00461 count++; 00462 00463 /* Decrement the loop counter */ 00464 blkCnt--; 00465 } 00466 } 00467 else 00468 { 00469 /* If the srcBLen is not a multiple of 4, 00470 * the blockSize2 loop cannot be unrolled by 4 */ 00471 blkCnt = (uint32_t) blockSize2; 00472 00473 while(blkCnt > 0u) 00474 { 00475 /* Accumulator is made zero for every iteration */ 00476 sum = 0.0f; 00477 00478 /* srcBLen number of MACS should be performed */ 00479 k = srcBLen; 00480 00481 while(k > 0u) 00482 { 00483 /* Perform the multiply-accumulate */ 00484 sum += *px++ * *py--; 00485 00486 /* Decrement the loop counter */ 00487 k--; 00488 } 00489 00490 /* Store the result in the accumulator in the destination buffer. */ 00491 *pOut++ = sum; 00492 00493 /* Update the inputA and inputB pointers for next MAC calculation */ 00494 px = pIn1 + count; 00495 py = pSrc2; 00496 00497 /* Increment the MAC count */ 00498 count++; 00499 00500 /* Decrement the loop counter */ 00501 blkCnt--; 00502 } 00503 } 00504 00505 00506 /* -------------------------- 00507 * Initializations of stage3 00508 * -------------------------*/ 00509 00510 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00511 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00512 * .... 00513 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00514 * sum += x[srcALen-1] * y[srcBLen-1] 00515 */ 00516 00517 /* In this stage the MAC operations are decreased by 1 for every iteration. 00518 The count variable holds the number of MAC operations performed */ 00519 count = srcBLen - 1u; 00520 00521 /* Working pointer of inputA */ 00522 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00523 px = pSrc1; 00524 00525 /* Working pointer of inputB */ 00526 pSrc2 = pIn2 + (srcBLen - 1u); 00527 py = pSrc2; 00528 00529 while(blockSize3 > 0) 00530 { 00531 /* Accumulator is made zero for every iteration */ 00532 sum = 0.0f; 00533 00534 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00535 k = count >> 2u; 00536 00537 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00538 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00539 while(k > 0u) 00540 { 00541 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00542 sum += *px++ * *py--; 00543 00544 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00545 sum += *px++ * *py--; 00546 00547 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00548 sum += *px++ * *py--; 00549 00550 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00551 sum += *px++ * *py--; 00552 00553 /* Decrement the loop counter */ 00554 k--; 00555 } 00556 00557 /* If the count is not a multiple of 4, compute any remaining MACs here. 00558 ** No loop unrolling is used. */ 00559 k = count % 0x4u; 00560 00561 while(k > 0u) 00562 { 00563 /* Perform the multiply-accumulates */ 00564 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00565 sum += *px++ * *py--; 00566 00567 /* Decrement the loop counter */ 00568 k--; 00569 } 00570 00571 /* Store the result in the accumulator in the destination buffer. */ 00572 *pOut++ = sum; 00573 00574 /* Update the inputA and inputB pointers for next MAC calculation */ 00575 px = ++pSrc1; 00576 py = pSrc2; 00577 00578 /* Decrement the MAC count */ 00579 count--; 00580 00581 /* Decrement the loop counter */ 00582 blockSize3--; 00583 00584 } 00585 00586 /* set status as ARM_MATH_SUCCESS */ 00587 status = ARM_MATH_SUCCESS; 00588 } 00589 00590 /* Return to application */ 00591 return (status); 00592 00593 #else 00594 00595 /* Run the below code for Cortex-M0 */ 00596 00597 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00598 float32_t *pIn2 = pSrcB; /* inputB pointer */ 00599 float32_t sum; /* Accumulator */ 00600 uint32_t i, j; /* loop counters */ 00601 arm_status status; /* status of Partial convolution */ 00602 00603 /* Check for range of output samples to be calculated */ 00604 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00605 { 00606 /* Set status as ARM_ARGUMENT_ERROR */ 00607 status = ARM_MATH_ARGUMENT_ERROR; 00608 } 00609 else 00610 { 00611 /* Loop to calculate convolution for output length number of values */ 00612 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00613 { 00614 /* Initialize sum with zero to carry on MAC operations */ 00615 sum = 0.0f; 00616 00617 /* Loop to perform MAC operations according to convolution equation */ 00618 for (j = 0u; j <= i; j++) 00619 { 00620 /* Check the array limitations for inputs */ 00621 if((((i - j) < srcBLen) && (j < srcALen))) 00622 { 00623 /* z[i] += x[i-j] * y[j] */ 00624 sum += pIn1[j] * pIn2[i - j]; 00625 } 00626 } 00627 /* Store the output in the destination buffer */ 00628 pDst[i] = sum; 00629 } 00630 /* set status as ARM_SUCCESS as there are no argument errors */ 00631 status = ARM_MATH_SUCCESS; 00632 } 00633 return (status); 00634 00635 #endif /* #ifndef ARM_MATH_CM0 */ 00636 00637 } 00638